Here, I analyse the transposable elements (TE) of a world-wide whole genome sampling of Zymoseptoria tritici, as well as the defence mechanism against TE that is RIP.
library(knitr)
library(reticulate)
#Data wrangling and data viz
library(tidyverse)
library(purrr)
library(plotly)
library(cowplot)
library(GGally)
library(ggrepel)
library(pheatmap)
library(ggridges)
library(ggpubr)
library(tidytree)
#library(ggtree)
#Statistics
library(car)
library(corrr)
library(lsmeans)
library(multcomp)
library(sp)
library(raster)
#Variables
world <- map_data("world")
project_dir="~/Documents/Postdoc_Bruce/Projects/WW_project/"
lists_dir = "~/Documents/Postdoc_Bruce/Projects/WW_project/WW_PopGen/Keep_lists_samples/"
#Data directories
data_dir=paste0(project_dir, "0_Data/")
metadata_dir=paste0(project_dir, "Metadata/")
fig_dir = "~/Documents/Postdoc_Bruce/Manuscripts/Feurtey_WW_Zt/Draft_figures/"
#Analysis directories
#-___________________
VAR_dir = paste0(project_dir, "1_Variant_calling/")
depth_per_window_dir = paste0(VAR_dir, "1_Depth_per_window/")
depth_per_gene_dir = paste0(VAR_dir, "2_Depth_per_gene/")
vcf_dir = paste0(VAR_dir, "4_Joint_calling/")
mito_SV = paste0(VAR_dir, "6_Mito_SV/")
chipseq_dir = paste0(VAR_dir, "7_ChipSeq_peaks/")
PopStr_dir = paste0(project_dir, "2_Population_structure/")
nuc_PS_dir=paste0(PopStr_dir, "0_Nuclear_genome/")
mito_PS_dir = paste0(PopStr_dir, "1_Mitochondrial_genome/")
Sumstats_dir = paste0(project_dir, "3_Sumstats_demography/")
TE_RIP_dir=paste0(project_dir, "4_TE_RIP/")
RIP_DIR=paste0(TE_RIP_dir, "0_RIP_estimation/")
DIM2_DIR=paste0(TE_RIP_dir, "1_Blast_from_denovo_assemblies/")
GEA_dir=paste0(project_dir, "5_GEA/")
fung_dir=paste0(project_dir, "6_Fungicide_resistance/")
virulence_dir = paste0(project_dir, "7_Virulence/")
sel_dir = paste0(project_dir, "8_Selection/")
gene_list_dir = paste0(sel_dir, "0_Lists_unique_copy/")
#Files
vcf_name="Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp"
vcf_name_nomaf="Ztritici_global_March2021.filtered-clean.AB_filtered.SNP.max-m-0.8.thin-1000bp"
vcf_name_mito = "Ztritici_global_March2021.genotyped.mt.filtered.clean.AB_filtered.variants.good_samples.max-m-80"
Zt_list = paste0(lists_dir, "Ztritici_global_March2021.genotyped.good_samples.args")
effectors_annotation_file = "~/Documents/Postdoc_Eva/Manuscripts/Accepted/Alice_Cecile_Comparative_genomics/Data_for_publication/Annotations_2018_genomes_for_publication.tab"
eggnog_annotation = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.eggnog")
gff_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.gff3")
ref_fasta_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa")
#metadata_name = "Main_table_from_SQL_Feb_2020"
metadata_name = "Main_table_from_SQL_June_2022_2"
gene_annotation = read_tsv(paste0(data_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))
complete_mito = read_tsv(paste0(data_dir, "Complete_mitochondria_from_blast.txt"), col_names = c("ID_file", "Contig"))
Sys.setenv(PROJECTDIR=project_dir)
Sys.setenv(VARDIR=VAR_dir)
Sys.setenv(VCFDIR=vcf_dir)
Sys.setenv(POPSTR=PopStr_dir)
Sys.setenv(MITOPOPSTR=mito_PS_dir)
Sys.setenv(TERIP=TE_RIP_dir)
Sys.setenv(SUMST=Sumstats_dir)
Sys.setenv(GEADIR=GEA_dir)
Sys.setenv(ZTLIST=Zt_list)
Sys.setenv(GFFFILE = gff_file)
Sys.setenv(REFFILE = ref_fasta_file)
Sys.setenv(VCFNAME=vcf_name)
Sys.setenv(VCFNAME_NOMAF=vcf_name_nomaf)
Sys.setenv(VCFNAME_MITO=vcf_name_mito)
#knitr::opts_chunk$set(echo = F)
knitr::opts_chunk$set(message = F)
knitr::opts_chunk$set(warning = F)
knitr::opts_chunk$set(results = T)
knitr::opts_chunk$set(dev=c('png', 'pdf'))
# Metadata and sample lists
##Filtered_samples
filtered_samples = bind_rows(
read_tsv(paste0(metadata_dir, "Sample_removed_based_on_IBS.args"), col_names = "ID_file") %>%
mutate(Filter = "IBS"),
read_tsv(paste0(metadata_dir, "Sample_with_too_much_NA.args"), col_names = "ID_file") %>%
mutate(Filter = "High_NA"),
read_tsv(paste0(metadata_dir, "Samples_to_filter_out.args"), col_names = "ID_file") %>%
mutate(Filter = "Mutants_etc"))
##Samples in vcf
genotyped_samples = read_tsv(Zt_list, col_names = "ID_file")
## Metadata of genotyped samples
temp = read_tsv(paste0(metadata_dir, metadata_name, "_Description.tab"), col_names = F) %>% pull()
Zt_meta = read_delim(paste0(metadata_dir, metadata_name, "_with_collection.tab"),
col_names = temp, delim = "\t",
na = "\\N", guess_max = 2000) %>%
unite(Coordinates, Latitude, Longitude, sep = ";", remove = F) %>%
inner_join(., genotyped_samples) %>%
mutate(Country = ifelse(Country == "USA", paste(Country, Region, sep = "_"), Country)) %>%
mutate(Country = ifelse(Country == "Australia", paste(Country, Region, sep = "_"), Country)) %>%
mutate(Country = ifelse(Country == "NZ", "New Zealand", Country)) %>%
mutate(Country = ifelse(Country == "CH", "Switzerland", Country)) %>%
mutate(Latitude2 = round(Latitude, 2), Longitude2 = round(Longitude, 2)) %>%
dplyr::select(ID_file, Continent, Country, Latitude, Longitude, Latitude2, Longitude2,
Sampling_Date, Collection)
temp = read_tsv(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.tsv")) %>%
dplyr::select(ID_file = Sample, Cluster)
Zt_meta = full_join(Zt_meta, temp)
#genotyped_samples %>%
# filter(!(ID_file %in% filtered_samples$ID_file)) %>%
# write_tsv(Zt_list, col_names = F)
#Define colors
## For continents
#myColors <- c("#04078B", "#a10208", "#FFBA08", "#CC0044", "#5C9D06", "#129EBA","#305D1B")
myColors <- c("#DA4167", "grey", "#ffba0a", "#A20106", "#3F6E0C", "#129eba", "#8fa253" )
names(myColors) = levels(factor(Zt_meta$Continent))
Color_Continent = ggplot2::scale_colour_manual(name = "Continent", values = myColors)
Fill_Continent = ggplot2::scale_fill_manual(name = "Continent", values = myColors)
#For clusters
## Simplified color scheme
myColors_clust <- c("#129eba", "#3F6E0C", "#DA4167", "#ffba0a", "#129eba", "#129eba",
"#8fa253", "#A20106", "#A20106", "#3F6E0C", "#8fa253")
names(myColors_clust) = levels(factor(Zt_meta$Cluster))
Color_Cluster = ggplot2::scale_colour_manual(name = "Cluster", values = myColors_clust)
Fill_Cluster = ggplot2::scale_fill_manual(name = "Cluster", values = myColors_clust)
## Detailed color scheme
K_colors = c("#0D6E82", #V1 Aus (TAS)
"#49810E", #V10 USA
"#E16684", #V11 North Africa
"#FFBA0A", #V2 Europe
"#C7F1F9", #V3 NZ
"#21C7E8", #V4 Australia (NSW)
"#8FA253", #V5 Uruguay + Argentina
"#650104", #V6 Israel + Turkey
"#DE020A", #V7 Iran
"#2A4908", #V8 Canada
"#B3C186") #V9 Boliva + Chile + Ecuador
names(K_colors) = levels(factor(Zt_meta$Cluster))
Color_Cluster2 = ggplot2::scale_colour_manual(name = "Cluster", values = K_colors)
Fill_Cluster2 = ggplot2::scale_fill_manual(name = "Cluster", values = K_colors)
#
K_colors2 = c("grey",
"#8ECAE6", #V1 Aus (TAS)
"#49810E", #V10 USA
"#E16684", #V11 North Africa
"#FFBA0A", #V2 Europe
"#126782", #V3 NZ
"#219EBC", #V4 Australia (NSW)
"#8FA253", #V5 Uruguay + Argentina
"#650104", #V6 Israel + Turkey
"#DE020A", #V7 Iran
"#2A4908", #V8 Canada
"#B3C186") #V9 Boliva + Chile + Ecuador
myShapes2 <- c(1, 1, 1, 1, 1, 2, 0,
1, 1, 0, 0, 0)
names(K_colors2) = levels(factor(ifelse(is.na(Zt_meta$Cluster), "Hybrid", Zt_meta$Cluster)))
Color_Cluster3 = ggplot2::scale_colour_manual(name = "Cluster", values = K_colors2)
Fill_Cluster3 = ggplot2::scale_fill_manual(name = "Cluster", values = K_colors2)
Shape_Cluster2 = ggplot2::scale_shape_manual(name = "Cluster", values = myShapes2)
## Shapes for clusters
myShapes <- c(1, 1, 1, 1, 2, 0, 1, 1, 0, 0, 0)
names(myShapes) = levels(factor(Zt_meta$Cluster))
Shape_Cluster = ggplot2::scale_shape_manual(name = "Cluster", values = myShapes)
## For correlations
mycolorsCorrel<- colorRampPalette(c("#0f8b8d", "white", "#a8201a"))(20)
Bioclim_var = c("Annual Mean Temperature", "Mean Diurnal Range ",
"Isothermality (BIO2/BIO7) (×100)", "Temperature Seasonality (standard deviation ×100)",
"Max Temperature of Warmest Month", "Min Temperature of Coldest Month",
"Temperature Annual Range (BIO5-BIO6)",
"Mean Temperature of Wettest Quarter","Mean Temperature of Driest Quarter",
"Mean Temperature of Warmest Quarter", "Mean Temperature of Coldest Quarter",
"Annual Precipitation", "Precipitation of Wettest Month",
"Precipitation of Driest Month", "Precipitation Seasonality (Coefficient of Variation)",
"Precipitation of Wettest Quarter", "Precipitation of Driest Quarter",
"Precipitation of Warmest Quarter","Precipitation of Coldest Quarter")
#Run on the cluster
#Create bed file with 10kb windows
#(including the last window which can be smaller)
while read chr length temp temp2 temp3; do
start=0;
while [ "$start" -le "$length" ] ; do
if [ "$(($start + 10000))" -le "$length" ] ;
then
echo -e "${chr}\t${start}\t$(($start + 10000))" ;
else echo -e "${chr}\t${start}\t$length" ;
fi ;
start=`expr $start + 10000` ;
done ;
done < /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa.fai > /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed
#GC etc from bedtools nuc
~/Software/bedtools nuc \
-fi /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa \
-bed /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
> /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab
#RIP per window
#(my script makes a summary for all seq in a multifasta, so I tricked it my giving it a single window at a time)
#columns are "CHROM", "Start", "End", "GC", "Product index", "Substrate index", "Composite"
while read chr start end ;
do
echo -e "${chr}\t${start}\t${end}" > temp.bed ;
~/Software/bedtools getfasta -fi /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa \
-bed temp.bed -fo temp.fa ;
python GC_RIP_per_read_fastq.py --input_format fasta --out temp temp.fa ;
values=$(cat temp.txt | ~/Software/datamash-1.3/datamash transpose | grep "Median" | cut -f 2,3,4,5) ;
echo -e "${chr}\t${start}\t${end}\t$values" \
>> /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv ;
done < /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed
#Count variants
#columns are CHROM, Start(IN 10KB UNITS), Variant_number
zcat /data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.vcf.gz | \
grep -v "#" | awk 'BEGIN {FS="\t"; OFS="\t"} {print $1,$2,int($2/10000)}' | \
~/Software/bedtools groupby -g 1,3 -o count -c 2 > \
/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.10kb_windows.SNP_counts.tab
#Gene coverage
#columns are CHROM, Start, End, Nb_overlapping_genes, Coverage_bp_gene, Window_length, Coverage_frac_gene
~/Software/bedtools coverage \
-a /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
-b /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.mRNA.gff3 \
> /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.gene_coverage.tab
#Reference TE coverage
#columns are CHROM, Start, End, Nb_overlapping_TEs, Coverage_bp_TE, Window_length, Coverage_frac_TE
~/Software/bedtools coverage \
-a /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
-b /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.Badet_Oggenfuss_2019.TE.gtf \
> /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.TE_coverage.tab
Previously, based on the study of TE and RIP in fully-assembled Z.tritici genomes, a hypothesis was drawn. The lower RIP in TEs of European samples, as compared to Iranian isolates, could indicate a loss of RIP in Z.tritici when it spread out of its area of origin. Here, I would like to investigate this possibility in the different pop.
The data plotted here are based on the following steps:
I detected reference and non-reference TE insertions with ngs_te_mapper2. I investigate the possible biases between collections.
#Summarize numbers
#_________________
#grep "Number" /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*/ngs_te_mapper.log | grep "non-reference" -v | cut -d ":" -f 1,7 | sed 's|/| |g' | cut -d " " -f 7,9 | sort > /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/Ref_TE_numbers.txt
#grep "Number" /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*/ngs_te_mapper.log | grep "non-reference" | cut -d ":" -f 1,7 | sed 's|/| |g' | cut -d " " -f 7,9 | sort > /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/Non-ref_TE_numbers.txt
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*_TE_numbers.txt ../4_TE_RIP/
#Gather per strain files into one big file
#_________________________________________
#cd /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper
#for direc in $dir_list ; do if [ -f ./${direc}${direc%/}_1_paired.nonref.bed ] ; then awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}_1_paired.nonref.bed ; else awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}.nonref.bed ; fi ; done > Non-ref_all_strains.bed
#for direc in $dir_list ; do if [ -f ./${direc}${direc%/}_1_paired.ref.bed ] ; then awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}_1_paired.ref.bed ; else awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}.ref.bed ; fi ; done > Ref_all_strains.bed
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*ef_all_strains.bed ../4_TE_RIP/
#ls -d -1 */ > directory_list.txt
for direc in $dir_list ;
do
if [ -f ./${direc}${direc%/}_1_paired.nonref.bed ] ;
then
awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}_1_paired.nonref.bed ;
else awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}.nonref.bed ;
fi ;
done > Non-ref_all_strains_2022.bed
for direc in $dir_list ;
do
if [ -f ./${direc}${direc%/}_1_paired.ref.bed ] ;
then
awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}_1_paired.ref.bed ;
else awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}.ref.bed ;
fi ;
done > Ref_all_strains_2022.bed
TE_qty = full_join(read_delim(paste0(TE_RIP_dir, "Non-ref_TE_numbers_2022.txt"),
col_names = c("ID_file", "Non_ref_insertions"),
delim = " "),
read_delim(paste0(TE_RIP_dir, "Ref_TE_numbers_2022.txt"),
col_names = c("ID_file", "Ref_insertions"),
delim = " ")) %>%
mutate(Total_insertions = Ref_insertions + Non_ref_insertions) %>%
filter(Total_insertions > 0) %>%
right_join(., Zt_meta) %>%
mutate(Cluster = ifelse(is.na(Cluster), "Hybrid", Cluster))
TE_qty %>%
ggplot(aes(x = Non_ref_insertions, y = Ref_insertions, col = Continent)) +
geom_point(alpha = .8) +
theme_light() +
Color_Continent +
theme(legend.position = "None")
I investigate the TE abundance in relation to the sequencing collection, to consider whether there are biases to take into account.
#Collections comparisons
ggplot(TE_qty, aes(x = Total_insertions, col = Collection, fill = Collection)) +
geom_density(alpha = .4) +
theme_light() +
labs(title = "Comparison of whole TE content estimation per collection")
Let’s investigate possible technical biases.
depth = read_tsv(paste0(vcf_dir, "Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.idepth"))
GC = read_delim(paste0(RIP_DIR, "GC_percent.txt"), col_names = c("ID_file", "TE", "Estimate", "Median_GC", "Mean_GC"), delim = " ") %>%
dplyr::select(-TE, -Estimate)
biases = inner_join(TE_qty, depth, by = c("ID_file" = "INDV")) %>%
inner_join(GC)
p1 = biases %>%
ggplot(aes(x = Mean_GC, y = Total_insertions, col = Collection)) +
geom_point(alpha = .5) +
theme_light()
p2 = biases %>%
ggplot(aes(x = MEAN_DEPTH, y = Total_insertions, col = Collection)) +
geom_point(alpha = .5) +
theme_light()
cowplot::plot_grid(p1 + theme(legend.position = "none"), p2, rel_widths = c(1,1.9))
p1 = biases %>%
ggplot(aes(x = MEAN_DEPTH, y = Ref_insertions, col = Collection)) +
geom_point(alpha = .5) +
theme_light()
p2 = biases %>%
ggplot(aes(x = MEAN_DEPTH, y = Non_ref_insertions, col = Collection)) +
geom_point(alpha = .5) +
theme_light()
cowplot::plot_grid(p1 + theme(legend.position = "none"), p2, rel_widths = c(1,1.9))
temp = biases %>% filter(!is.na(Cluster)) %>% filter(!is.na(Continent)) %>%
filter(!is.na(Total_insertions))%>%
filter(!is.na(MEAN_DEPTH))%>%
filter(!is.na(Median_GC))
model1 = lm(Total_insertions ~ Cluster + MEAN_DEPTH + Median_GC, data = temp)
summary(model1)
##
## Call:
## lm(formula = Total_insertions ~ Cluster + MEAN_DEPTH + Median_GC,
## data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -307.711 -33.709 -1.362 37.705 249.868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 567.2297 65.3460 8.680 < 2e-16 ***
## ClusterV1 45.6478 10.1936 4.478 8.35e-06 ***
## ClusterV10 128.1398 7.6425 16.767 < 2e-16 ***
## ClusterV11 -83.9212 11.2323 -7.471 1.66e-13 ***
## ClusterV2 10.1703 6.1030 1.666 0.095921 .
## ClusterV3 65.7702 11.2161 5.864 6.04e-09 ***
## ClusterV4 32.4357 13.3899 2.422 0.015586 *
## ClusterV5 25.9236 11.2317 2.308 0.021188 *
## ClusterV6 -79.6353 10.2010 -7.807 1.41e-14 ***
## ClusterV7 -71.3924 14.7251 -4.848 1.43e-06 ***
## ClusterV8 47.5169 13.4094 3.544 0.000412 ***
## ClusterV9 45.8904 14.7385 3.114 0.001898 **
## MEAN_DEPTH 3.9614 0.1009 39.246 < 2e-16 ***
## Median_GC -6.5280 1.4730 -4.432 1.03e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54.42 on 1056 degrees of freedom
## Multiple R-squared: 0.7011, Adjusted R-squared: 0.6974
## F-statistic: 190.5 on 13 and 1056 DF, p-value: < 2.2e-16
model2 = lm(Total_insertions ~ Continent + MEAN_DEPTH + Median_GC, data = temp)
summary(model2)
##
## Call:
## lm(formula = Total_insertions ~ Continent + MEAN_DEPTH + Median_GC,
## data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -309.484 -34.943 -1.053 37.048 254.155
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 499.1885 58.2041 8.577 < 2e-16 ***
## ContinentAsia -11.4220 56.8060 -0.201 0.841
## ContinentEurope 46.2850 9.7583 4.743 2.39e-06 ***
## ContinentMiddle East -26.2534 11.1716 -2.350 0.019 *
## ContinentNorth America 157.1649 10.5782 14.857 < 2e-16 ***
## ContinentOceania 92.9660 10.8182 8.593 < 2e-16 ***
## ContinentSouth America 73.1153 12.2715 5.958 3.47e-09 ***
## MEAN_DEPTH 4.0470 0.1044 38.781 < 2e-16 ***
## Median_GC -5.9783 1.3129 -4.554 5.88e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56.05 on 1061 degrees of freedom
## Multiple R-squared: 0.6815, Adjusted R-squared: 0.6791
## F-statistic: 283.7 on 8 and 1061 DF, p-value: < 2.2e-16
anova(model1, model2)
## Analysis of Variance Table
##
## Model 1: Total_insertions ~ Cluster + MEAN_DEPTH + Median_GC
## Model 2: Total_insertions ~ Continent + MEAN_DEPTH + Median_GC
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1056 3127608
## 2 1061 3332809 -5 -205200 13.857 3.915e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model3 = lm(Total_insertions ~ Cluster + MEAN_DEPTH + Median_GC + Collection, data = temp)
summary(model3)
##
## Call:
## lm(formula = Total_insertions ~ Cluster + MEAN_DEPTH + Median_GC +
## Collection, data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -114.989 -19.193 -2.402 17.114 227.916
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 152.2514 66.9437 2.274 0.023150 *
## ClusterV1 -26.5866 8.5842 -3.097 0.002006 **
## ClusterV10 83.1622 6.2861 13.230 < 2e-16 ***
## ClusterV11 -63.8739 6.8743 -9.292 < 2e-16 ***
## ClusterV2 36.2213 4.7710 7.592 6.99e-14 ***
## ClusterV3 -1.5532 8.9855 -0.173 0.862800
## ClusterV4 65.0074 8.5497 7.603 6.43e-14 ***
## ClusterV5 10.4255 7.0077 1.488 0.137128
## ClusterV6 -45.0890 7.1527 -6.304 4.29e-10 ***
## ClusterV7 -36.6321 9.2621 -3.955 8.17e-05 ***
## ClusterV8 70.9785 8.8531 8.017 2.89e-15 ***
## ClusterV9 30.4021 9.0529 3.358 0.000813 ***
## MEAN_DEPTH 1.7115 0.1036 16.526 < 2e-16 ***
## Median_GC 2.1107 1.4917 1.415 0.157377
## CollectionETHZ_2020 90.4311 5.8412 15.482 < 2e-16 ***
## CollectionHartmann_FstQst_2015 -10.6307 6.7641 -1.572 0.116342
## CollectionHartmann_Oregon_2016 163.8927 7.1481 22.928 < 2e-16 ***
## CollectionINRAE_BIOGER 41.6198 4.9551 8.399 < 2e-16 ***
## CollectionJGI 19.7650 8.1813 2.416 0.015869 *
## CollectionJGI_2 41.3786 7.4722 5.538 3.88e-08 ***
## CollectionJGI_3 34.8982 9.0242 3.867 0.000117 ***
## CollectionJGI_4 -57.3218 33.5403 -1.709 0.087742 .
## CollectionJGI_5 58.9987 6.8272 8.642 < 2e-16 ***
## CollectionMMcDonald_2018 129.7312 9.0103 14.398 < 2e-16 ***
## CollectionSyngenta 139.4985 5.7606 24.216 < 2e-16 ***
## CollectionUnine_third_batch_2018 -13.1810 8.8978 -1.481 0.138808
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.79 on 1039 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.899, Adjusted R-squared: 0.8965
## F-statistic: 369.8 on 25 and 1039 DF, p-value: < 2.2e-16
The fit is indeed improved by adding the collection information, with both TE quantity estimations.
The non reference TIPs have some sort of support estimation. As a first step, I want to filter the ones that don’t seem reliable.
TE_insertions = bind_rows(read_tsv(paste0(TE_RIP_dir, "Non-ref_all_strains_2022.bed"),
col_names = c("ID_file", "chromosome", "position", "end", "info", "score")) %>%
mutate(ref_non_ref_TIP = "non_ref"),
read_tsv(paste0(TE_RIP_dir, "Ref_all_strains_2022.bed"),
col_names = c("ID_file", "chromosome", "position", "end", "info", "score")) %>%
mutate(ref_non_ref_TIP = "ref")) %>%
separate(col = chromosome, into = c("X1", "CHR", "X2", "X3", "X4"), sep = "_") %>%
separate(col = info, into = c("TE_family", "TSD", "Allele_Frequency", "3_support", "5_support", "ref_reads"), sep = "\\|") %>%
dplyr::select(-c(X1, X2, X3, X4, score))%>%
inner_join(Zt_meta %>% dplyr::select(ID_file, Continent, Cluster, Sampling_Date, Collection)) %>%
filter(!is.na(Continent)) %>% filter(Continent != "Asia") %>%
mutate(position = 100*trunc(position/100), end = 100*trunc(end/100)) %>%
unite(TE_family, CHR, position, end, col = "TE_insertion", sep = ":", remove = F) %>%
separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"))
threshold = 0.9
subset_nb = 10000
subset = slice_sample(TE_insertions, n = subset_nb)
nb_non_filtered = nrow(TE_insertions)
nb_filtered = nrow(filter(TE_insertions, is.na(Allele_Frequency) | Allele_Frequency > threshold))
temp = nrow(filter(subset, as.numeric(Allele_Frequency) < threshold))
ggplot(subset, aes(as.numeric(Allele_Frequency))) +
geom_density() +
geom_vline(aes(xintercept = threshold), color = "#82C0CC") +
geom_label(x = 0.5, y = 100, fill = "white",
label = str_wrap(paste0((100*temp/subset_nb), "% TE with allelic frequency below ", threshold,
". \n The dataset goes from ", nb_non_filtered, " detected_TE_insertions to ",
nb_filtered, " after filtering."), 70)) +
theme_light() +
labs(x = "Allele Frequency of the TIPs PAV for each isolate",
title = "Threshold for non-reference TIP filtering")
TE_insertions = filter(TE_insertions, is.na(Allele_Frequency) | Allele_Frequency > threshold)
Once, this is done, I want to explore some basic statistics about the TIPs. In many other studies, it seems that the TIPs SFS is biased toward lower frequency. As shown above, we have a depth bias here, so it is likely that this bias, if found in Z. tritici would be made even stronger by a non-detection artefact in low depth genomes. So in that context, here are some questions of interest: What are the frequency of the TE insertions we found? Are they shared by several samples?
TE_insertions_counts = TE_insertions %>%
unite(Continent, ID_file, col = "for_display", remove = F) %>%
dplyr::count(TE_insertion, ref_non_ref_TIP)
# SFS
p1 = ggplot(TE_insertions_counts, aes(x = n)) +
geom_density( col = "#2EC4B6", fill = "#CBF3F0") + theme_light() +
labs(x = "TE insertion count") +
scale_x_continuous(trans = "log10")
data = TE_insertions_counts %>%
mutate(for_bar = case_when(n == 1 ~ "01",
n == 2 ~ "02",
n <= 10 ~ "10",
n <= 100 ~ "100",
n >= 100 ~ "100 +")) %>%
dplyr::count(for_bar)
data$fraction = data$n / sum(data$n)
data$ymax = cumsum(data$fraction) # Compute the cumulative percentages (top of each rectangle)
data$ymin = c(0, head(data$ymax, n=-1)) # Compute the bottom of each rectangle
data$labelPosition <- (data$ymax + data$ymin) / 2
data$label1 <- ifelse(round(data$fraction*100) > 2, paste0(round(data$fraction*100), "%"), "")
data$label2 <- ifelse(round(data$fraction*100) > 2, "", paste0(round(data$fraction*100), "%"))
# Doughnut chart of frequencies
p2 = ggplot(data, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=for_bar)) +
geom_rect() +
coord_polar(theta="y") +
xlim(c(2,4)) +
theme_void() +
geom_text( x=3.5, aes(y=labelPosition, label=label1), size=4) +
geom_text( x=4.2, aes(y=labelPosition, label=label2), size=4) +
scale_fill_manual(values = c("#CBF3F0", "#2EC4B6", "#1F847A", "#1B746B", "#041F1E")) +
labs(fill = "Number of strains",
title = "Three quarter of all TE insertions are singletons.")
cowplot::plot_grid(p1, p2)
#TE insertions per Superfamily and frequency
TE_insertions_counts %>%
separate(TE_insertion, into = c("TE_family","CHR","Start", "End"), sep = ":", remove = F) %>%
separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"),
Length = abs(as.numeric(Start) - as.numeric(End)),
TIP_nb = n) %>%
dplyr::count(Order, Superfamily, TIP_nb) %>%
mutate(for_bar = case_when(TIP_nb == 1 ~ "01",
TIP_nb == 2 ~ "02",
TIP_nb <= 10 ~ "10",
TIP_nb <= 100 ~ "100",
TIP_nb >= 100 ~ "100 +")) %>%
ggplot(aes(x = Superfamily, y = n, fill = for_bar)) +
geom_bar(stat= "identity") +
theme_light() +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
facet_wrap(vars(Order), scales = "free_x")+
scale_fill_manual(values = c("#CBF3F0", "#2EC4B6", "#1F847A", "#1B746B", "#041F1E"))
Although most TIPs are found a low to very low frequency in our analysis, there are some that are identified in several samples. Among the TE insertions which are found multiple times, are the TIPs shared by isolates from different clusters or are they always from a single cluster?
theshold = 5
#Filtering only insertions found more than 5 times
insertions_per_pop = TE_insertions %>%
filter(!is.na(Cluster)) %>%
dplyr::count(Cluster, TE_insertion, ref_non_ref_TIP) %>%
filter(n > theshold) %>%
pivot_wider(names_from = Cluster, values_from = n) %>%
mutate(n = 1) %>%
dplyr::rowwise() %>%
dplyr::mutate(NA_count = sum(is.na(c(V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11))),
Nb_population = 11 - NA_count)
#Number of TIPS shared by populations
temp2 = insertions_per_pop %>%
dplyr::count(Nb_population, name = "Insertion_number") %>%
mutate(Insertion_type = ifelse(Nb_population == 1, "Population specific",
ifelse(Nb_population == 11, "Shared by all populations", "Intermediate")),
label = ifelse(Nb_population == 1, paste0("N=", Insertion_number),
ifelse(Nb_population == 11, paste0("N=", Insertion_number), "")))
prop_non_spe = round(sum(temp2$Insertion_number[temp2$Insertion_type != "Population specific"])/sum(temp2$Insertion_number)*100)
ggplot(temp2, aes(x = reorder(as.character(Nb_population), Nb_population),
y = Insertion_number, fill = Insertion_type)) +
geom_bar(stat = "identity") +
geom_text(aes(y = Insertion_number + 30, label = label), size=3) +
geom_label(x = 7, y = max(temp2$Insertion_number)/2,
label = str_wrap(paste0(prop_non_spe, "% of insertions are found in two or more populations"), 20),
fill = "white") +
theme_light() +
scale_fill_manual(values = c("grey", "#ff9f1c", "#2ec4b6")) +
labs(x = "Number of populations displaying an insertion",
y = "Number of TE insertions",
title = str_wrap(paste0("Two thirds of TE insertions found in more than ",
theshold, " isolates are population-specific."),55))
So far, I’ve looked at TIPs from all TE categories together. I’m curious to see how it matches with the superfamilies and orders of TEs.
temp2 = insertions_per_pop %>%
mutate(Insertion_type = ifelse(Nb_population == 1, "Population specific",
ifelse(Nb_population == 11, "Shared by all populations", "Intermediate"))) %>%
dplyr::select(TE_insertion, Insertion_type)
#The insertions per pop were filtered for only TIP higher than 5, so temp2 and the following plot as well
right_join(TE_insertions_counts, temp2) %>%
separate(TE_insertion, into = c("TE_family","CHR","Start", "End"), sep = ":", remove = F) %>%
separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"),
Length = abs(as.numeric(Start) - as.numeric(End))) %>%
dplyr::count(Order, Superfamily, Insertion_type) %>%
ggplot(aes(x = Superfamily, y = n, fill = Insertion_type)) +
geom_bar(stat= "identity") +
theme_light() +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
facet_wrap(vars(Order), scales = "free")+
scale_fill_manual(values = c("grey", "#ff9f1c", "#2ec4b6"))
Second, I check whether a plot and statistics including the depth of coverage still recovers the difference between groups that I have observed above.
inner_join(filter(TE_insertions_counts, n > 0), TE_insertions) %>%
group_by(ID_file, Continent, Cluster) %>%
dplyr::count() %>%
inner_join(depth, by = c("ID_file" = "INDV")) %>%
ggplot(aes(x = MEAN_DEPTH, y = n)) +
geom_point(aes(shape = Cluster, col = Cluster), alpha = .8, size = 2) +
theme_light() +
labs(title = "Depth bias check TIP comparison between cluster",
subtitle = "All TIPs (population specific and others)",
x = "Mean depth of coverage",
y = paste0("Number of TIP")) +
Color_Cluster + Shape_Cluster
inner_join(filter(TE_insertions_counts, n > 0), TE_insertions) %>%
group_by(ID_file, Continent, Cluster) %>%
dplyr::count() %>%
inner_join(depth, by = c("ID_file" = "INDV")) %>%
ggplot(aes(x = MEAN_DEPTH, y = n, color = Continent)) +
geom_point(alpha = .2, size = 2) +
geom_smooth(aes(), se = F, method = "glm", formula = y~log(x)) +
theme_light() +
labs(title = "Depth bias check TIP comparison between continent",
subtitle = "All TIPs (population specific and others)",
x = "Mean depth of coverage",
y = paste0("Number of TIP")) +
Color_Continent
Let’s compare the TE content per population and per continent. The statistics used here are a one-way ANOVA with block. Blocks are used in an analysis of variance or similar models in order to account for suspected variation from factors other than the treatments or main independent variables being investigated. Here I considered the collection as the confounding factor. It definitely has an effect and was thus accounted for in the statistics related to TE content and to RIP level.
Genomes from isolates in Oceania and the Americas seem to contain more TE than those from the Middle-East, in particular.
Let’s see if the same result is obtained with the TE insertions as detected by ngs_te_mapper2.
TE_insertion_wrld_average = TE_qty %>%
dplyr::summarize(avg = mean(as.numeric(Total_insertions), na.rm = T)) %>%
pull(avg)
temp = filter(TE_qty, !is.na(Continent) & Continent != "Asia")%>%
inner_join(depth, by = c("ID_file" = "INDV"))
ggplot(temp, aes(MEAN_DEPTH)) + geom_density() + geom_vline(xintercept = 35)
#Cluster
model = lm(Total_insertions ~ Continent, data=filter(temp, MEAN_DEPTH <= 35))
CLD = cld(lsmeans(model, ~ Continent), alpha = 0.05, Letters = letters, adjust = "sidak")
CLD$.group=gsub(" ", "", CLD$.group)
filter(temp, MEAN_DEPTH <= 35) %>%
ggplot(aes(x = Continent, y = Total_insertions, fill = Continent)) +
geom_violin(alpha = .8) +
theme_light() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = str_wrap("Number of insertions per sample", width = 30),
title = "TE insertions per continent",
subtitle = str_wrap(paste(""), width = 70)) +
geom_text(data = CLD, aes(x = Continent, label = .group, y = 750), color = "black") +
Fill_Continent +
stat_summary(aes(x = Continent, y = Total_insertions), fun = mean, geom = "point", size = 2, color = "grey30") +
geom_hline(aes(yintercept = TE_insertion_wrld_average), color = "gray30", size = 0.6, linetype = "dashed")
filter(temp, MEAN_DEPTH <= 35) %>%
group_by(Continent) %>%
summarize(Median_insertions = round(median(Total_insertions, na.rm = T)),
Mean_insertions = round(mean(Total_insertions, na.rm = T)),
Min_insertions = min(Total_insertions, na.rm = T),
Max_insertions = max(Total_insertions, na.rm = T))
## # A tibble: 6 × 5
## Continent Median_insertions Mean_insertions Min_insertions Max_insertions
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Africa 300 304 184 429
## 2 Europe 334 340 209 669
## 3 Middle East 302 282 164 394
## 4 North America 462 456 283 586
## 5 Oceania 373 375 258 513
## 6 South America 375 364 237 476
TE_insertion_wrld_average = TE_qty %>%
dplyr::summarize(avg = mean(as.numeric(Total_insertions), na.rm = T)) %>%
pull(avg)
#Cluster
model = lm(Total_insertions ~ Cluster, data=TE_qty)
CLD = cld(lsmeans(model, ~ Cluster), alpha = 0.05, Letters = letters, adjust = "sidak")
CLD$.group=gsub(" ", "", CLD$.group)
TE_qty %>%
filter(!is.na(Cluster)) %>%
ggplot(aes(x = Cluster, y = Total_insertions, fill = Cluster)) +
geom_violin(alpha = .8) +
theme_light() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = str_wrap("Number of insertions per sample", width = 30),
title = "TE insertions per cluster",
subtitle = str_wrap(paste(""), width = 70)) +
geom_text(data = CLD, aes(x = Cluster, label = .group, y = 750), color = "black") +
Fill_Cluster +
stat_summary(aes(x = Cluster, y = Total_insertions), fun = mean, geom = "point", size = 2, color = "grey30") +
geom_hline(aes(yintercept = TE_insertion_wrld_average), color = "gray30", size = 0.6, linetype = "dashed")
The pattern is somewhat different with the TE insertions. In this case,
the Oceania samples don’t look particularly high. However, the pattern
with the African and Middle-Eastern samples being lower is even
clearer.
I know for sure that my estimates are biased, either because of depth, GC content, or both. So I want to make some more checks to ensure that I am confident in the results I show. First, I want to try and compare the clusters intra-collection to ensure that the results observed previously can be confirmed within each collection.
temp = TE_qty %>%
filter(Cluster != "NA") %>%
dplyr::count(Collection, Cluster) %>%
mutate(n = ifelse(n < 5, NA, n)) %>%
pivot_wider(names_from = Cluster, values_from = n)%>%
rowwise() %>%
dplyr::mutate(NA_count = sum(is.na(c(V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11))))
temp
## # A tibble: 16 × 14
## # Rowwise:
## Collection Hybrid V2 V10 V11 V5 V6 V7 V8 V9 V4 V1
## <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 Eschikon_… NA 93 NA NA NA NA NA NA NA NA NA
## 2 ETHZ_2020 24 NA 24 14 22 14 7 8 12 NA NA
## 3 Hartmann_… NA 29 50 NA NA 25 NA NA NA 27 NA
## 4 Hartmann_… NA NA 94 NA NA NA NA NA NA NA NA
## 5 INRAE_BIO… 5 102 NA NA NA NA NA NA NA NA NA
## 6 JGI 6 13 6 NA NA NA NA 10 NA NA NA
## 7 JGI_1 NA NA NA NA NA NA NA NA NA NA NA
## 8 JGI_2 11 11 NA NA NA NA NA NA NA NA NA
## 9 JGI_3 6 8 NA NA NA NA 6 NA NA NA NA
## 10 JGI_4 NA NA NA NA NA NA NA NA NA NA NA
## 11 JGI_5 18 11 NA 7 NA NA NA NA NA NA NA
## 12 Megan_McD… NA NA NA NA NA NA NA NA NA 9 NA
## 13 MMcDonald… 21 NA NA NA NA NA NA NA NA NA 40
## 14 Syngenta 7 273 NA NA NA NA NA NA NA NA NA
## 15 Unine_thi… NA 8 NA NA NA NA NA NA NA NA NA
## 16 <NA> NA 6 NA NA NA NA NA NA NA NA NA
## # … with 2 more variables: V3 <int>, NA_count <int>
collections_multiple = temp %>%
filter(NA_count <= 9) %>%
pivot_longer(cols = -c(Collection, NA_count), names_to = "Cluster", values_to = "n") %>%
filter(!is.na(n))
inner_join(TE_qty, dplyr::select(collections_multiple, Collection, Cluster)) %>%
ggplot() +
theme_light() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = str_wrap("Percentage of reads", width = 30),
subtitle = str_wrap(paste(""), width = 70)) +
geom_boxplot(aes(x = Cluster, y = Total_insertions, fill = Cluster), alpha = .8) +
Fill_Cluster + Color_Cluster +
labs(title = "TE insertions per collection per cluster") +
facet_wrap(vars(Collection), scales = "free")
We can see that different clusters and continent have a different overall TE content. However, I am curious to see if there is a geographical clustering intrinsic to the TE content that can be uncovered without an a priori sorting of samples.
temp = TE_insertions %>% dplyr::count(Superfamily, ID_file, Order, Continent, name = "Nb_TIP")
p1 = filter(temp, Order == "Class I (retrotransposons)") %>%
ggplot(aes(x = Superfamily,
y = Nb_TIP,
color = Continent)) +
geom_boxplot(outlier.shape = NA) +
theme_light() +
labs(y = "TIPs") +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
Color_Continent
p2 = filter(temp, Order == "Class II (DNA transposons)")%>%
ggplot(aes(x = Superfamily,
y = Nb_TIP,
color = Continent)) +
geom_boxplot(outlier.shape = NA) +
theme_light() +
ylab("TIPs") +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
Color_Continent
temp = cowplot::plot_grid(p1 + theme(legend.position = "None", axis.title.x = element_blank()) +
scale_y_continuous(trans = "log10"),
p2 + theme(legend.position = "None") + scale_y_continuous(trans = "log10"),
ncol = 1)
cowplot::plot_grid(temp, get_legend(p1), nrow = 1, rel_widths = c(1, 0.3))
# Per cluster
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
new_x <- paste(x, within, sep = sep)
stats::reorder(new_x, by, FUN = fun)
}
scale_x_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}
TIP_per_superfamily_per_sample = TE_insertions %>% dplyr::count(Superfamily, ID_file, Order, Cluster, name = "Nb_TIP") %>%
filter(!is.na(Cluster)) %>% group_by(Order, Superfamily) %>%
mutate(General_median = median(Nb_TIP))
filter(TIP_per_superfamily_per_sample, Order == "Class I (retrotransposons)") %>%
filter(General_median >= 8 ) %>%
ggplot(aes(x = reorder_within(Cluster, Nb_TIP, Superfamily, median),
y = Nb_TIP,
color = Cluster)) +
geom_violin(aes(fill = Cluster), alpha = .4) +
geom_boxplot(outlier.shape = NA, width = .1) +
theme_light() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
Color_Cluster + Fill_Cluster +
facet_wrap(vars(Superfamily), scales = "free") +
labs(title = "Class I (retrotransposons)", x ="Genetic cluster", y = "TIPs") +
scale_x_reordered()
filter(TIP_per_superfamily_per_sample, Order == "Class II (DNA transposons)") %>%
filter(General_median >= 8 ) %>%
ggplot(aes(x = reorder_within(Cluster, Nb_TIP, Superfamily, median),
y = Nb_TIP,
color = Cluster)) +
geom_violin(aes(fill = Cluster), alpha = .4) +
geom_boxplot(outlier.shape = NA, width = .1) +
theme_light() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
Color_Cluster + Fill_Cluster +
facet_wrap(vars(Superfamily), scales = "free") +
labs(title = "Class II (DNA transposons)", x ="Genetic cluster", y = "TIPs") +
scale_x_reordered()
TIP_per_superfamily_per_sample %>%
group_by(ID_file) %>%
mutate(Nb_TIP_per_ID = sum(Nb_TIP)) %>%
ggplot(aes(reorder_within(x = ID_file, by = Nb_TIP_per_ID, within = Cluster), Nb_TIP, fill = Superfamily)) +
geom_bar(stat = "identity") +
facet_wrap(vars(Cluster), scales = "free_x") +
theme(axis.text.x = element_blank())
TIP_per_superfamily_per_sample %>%
group_by(ID_file) %>%
mutate(Nb_TIP_per_ID = sum(Nb_TIP)) %>%
ggplot(aes(ID_file, Nb_TIP, fill = Superfamily)) +
geom_bar(stat = "identity", position = "fill") +
theme(axis.text.x = element_blank())+
facet_grid(col = vars(Cluster), scales = "free_x", space = "free") +
theme(panel.spacing.x=unit(0.2, "lines"),panel.spacing.y=unit(1, "lines"))
Let’s compare TE content using the Middle-Eastern and African clusters as a baseline reference.
# Using the three Middle-Eastern and African
average_TIP_origin = TIP_per_superfamily_per_sample %>%
filter(Cluster %in% c("V6", "V7", "V11")) %>%
group_by(Superfamily, Order) %>%
summarize(Median_origin = median(Nb_TIP))
# Class I (retrotransposons)
temp = full_join(TIP_per_superfamily_per_sample, average_TIP_origin) %>%
filter(General_median >= 8 ) %>%
filter(!(Cluster %in% c("V6", "V7", "V11"))) %>%
mutate(Ratio_TIP_to_origin = Nb_TIP/Median_origin) %>%
filter(Order == "Class I (retrotransposons)")
temp2 = temp %>% group_by(Cluster, Superfamily) %>%
summarize(Med = median(Ratio_TIP_to_origin))
ggplot() +
geom_violin(data = temp, aes(x = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"),
y = Ratio_TIP_to_origin, fill = Cluster), col = NA, alpha = .3) +
#geom_jitter(data = temp, aes(x = Cluster, y = Ratio_TIP_to_origin, color = Cluster), width = .1, alpha = .1) +
geom_hline(yintercept = 1) +
geom_segment(data = temp2, aes(x = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"),
xend = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"),
y = 1, yend = Med, color = Cluster), size = 1) +
geom_point(data = temp2, aes(x = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"),
y = Med, color = Cluster), size = 2) +
theme_light() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
Color_Cluster + Fill_Cluster +
facet_wrap(vars(Superfamily), scales = "free_y") +
labs(title = "Class I (retrotransposons)", x ="Genetic cluster", y = "TIPs ratio to origin") +
scale_x_reordered()
ggsave("Large_TIP_per_family_prop_classI.pdf",
dpi = 500, width = 33, height = 15, units = "cm")
# Class II (DNA transposons)
temp = full_join(TIP_per_superfamily_per_sample, average_TIP_origin) %>%
filter(General_median >= 8 ) %>%
filter(!(Cluster %in% c("V6", "V7", "V11"))) %>%
mutate(Ratio_TIP_to_origin = Nb_TIP/Median_origin) %>%
filter(Order == "Class II (DNA transposons)")
temp2 = temp %>% group_by(Cluster, Superfamily) %>%
summarize(Med = median(Ratio_TIP_to_origin))
ggplot() +
geom_violin(data = temp, aes(x = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"),
y = Ratio_TIP_to_origin, fill = Cluster), col = NA, alpha = .3) +
#geom_jitter(data = temp, aes(x = Cluster, y = Ratio_TIP_to_origin, color = Cluster), width = .1, alpha = .1) +
geom_hline(yintercept = 1) +
geom_segment(data = temp2, aes(x = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"),
xend = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"),
y = 1, yend = Med, color = Cluster), size = 1) +
geom_point(data = temp2, aes(x = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"),
y = Med, color = Cluster), size = 2) +
theme_light() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
Color_Cluster + Fill_Cluster +
facet_wrap(vars(Superfamily), scales = "free_y") +
labs(title = "Class II (DNA transposons)", x ="Genetic cluster", y = "TIPs ratio to origin") +
scale_x_reordered()
ggsave("Large_TIP_per_family_prop_classII.pdf",
dpi = 500, width = 25, height = 15, units = "cm")
Per TE family now, instead of superfamily. Can we identify individual TEs that have spread in the genomes outside of the center of origin?
TIP_per_family_per_sample = TE_insertions %>% dplyr::count(TE_family, Superfamily, ID_file, Order, Cluster, name = "Nb_TIP") %>%
filter(!is.na(Cluster)) %>% group_by(Order, Superfamily, TE_family) %>%
mutate(General_median = median(Nb_TIP))
to_keep = unique(filter(TIP_per_family_per_sample, Nb_TIP > 10)$TE_family)
TIP_per_family_per_sample %>% filter(TE_family %in% to_keep) %>%
filter(Order == "Class I (retrotransposons)") %>%
separate(TE_family, into = c("TE_family_short", "Temp"), remove = F, sep = "-") %>%
ggplot(aes(x = reorder_within(Cluster, by = Nb_TIP, fun = median, within = TE_family_short),
y = Nb_TIP,
color = Cluster)) +
geom_violin(aes(fill = Cluster), alpha = .4) +
geom_boxplot(outlier.shape = NA, width = .1) +
theme_light() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
Color_Cluster + Fill_Cluster +
facet_wrap(vars(TE_family_short), scales = "free") +
labs(title = "Class I (retrotransposons)", x ="Genetic cluster", y = "TIPs") +
scale_x_reordered()
ggsave("Large_TIP_per_family_per_cluster_classI.pdf",
dpi = 500, width = 30, height = 20, units = "cm")
TIP_per_family_per_sample %>% filter(TE_family %in% to_keep) %>%
filter(Order == "Class II (DNA transposons)") %>%
separate(TE_family, into = c("TE_family_short", "Temp"), remove = F, sep = "-") %>%
ggplot(aes(x = reorder_within(Cluster, by = Nb_TIP, fun = median, within = TE_family_short),
y = Nb_TIP,
color = Cluster)) +
geom_violin(aes(fill = Cluster), alpha = .4) +
geom_boxplot(outlier.shape = NA, width = .1) +
theme_light() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
Color_Cluster + Fill_Cluster +
facet_wrap(vars(TE_family_short), scales = "free") +
labs(title = "Class II (DNA transposons)", x ="Genetic cluster", y = "TIPs") +
scale_x_reordered()
ggsave("Large_TIP_per_family_per_cluster_classII.pdf",
dpi = 500, width = 36, height = 20, units = "cm")
Let’s get some number of the text.
temp = TE_insertions %>%
dplyr::count(TE_family, ID_file, Order, Continent, name = "Nb_TIP") %>%
filter(!is.na(Continent) & Continent != "Asia")
x = filter(temp, Continent %in% c("Africa", "Middle East")) %>%
filter(TE_family == "RLC_Deimos_consensus-1") %>% pull(Nb_TIP) %>% mean()
paste0("The mean number of RLC Deimos per isolate in the center of origin is ", x)
## [1] "The mean number of RLC Deimos per isolate in the center of origin is 11.3137254901961"
x = filter(temp, !(Continent %in% c("Africa", "Middle East")))%>%
filter(TE_family == "RLC_Deimos_consensus-1") %>% pull(Nb_TIP) %>% mean()
paste0("The mean number of RLC Deimos per isolate outside the center of origin is ", x)
## [1] "The mean number of RLC Deimos per isolate outside the center of origin is 50.784375"
x = filter(temp, Continent %in% c("Africa", "Middle East")) %>%
filter(TE_family == "RLX_Styx_consensus-1_partial") %>% pull(Nb_TIP) %>% median()
y = filter(temp, Continent %in% c("Africa", "Middle East")) %>%
filter(TE_family == "RLX_Styx_consensus-1_partial") %>% pull(Nb_TIP) %>% mean()
z = filter(temp, Continent %in% c("Africa", "Middle East")) %>%
filter(TE_family == "RLX_Styx_consensus-1_partial") %>% pull(Nb_TIP) %>% max()
print(paste(x, y, z, sep = " "))
## [1] "1 1.77586206896552 17"
paste0("The median number of RLX Styx per isolate in the center of origin is ", x)
## [1] "The median number of RLX Styx per isolate in the center of origin is 1"
x = filter(temp, !(Continent %in% c("Africa", "Middle East")))%>%
filter(TE_family == "RLX_Styx_consensus-1_partial") %>% pull(Nb_TIP) %>% median()
y = filter(temp, !(Continent %in% c("Africa", "Middle East"))) %>%
filter(TE_family == "RLX_Styx_consensus-1_partial") %>% pull(Nb_TIP) %>% mean()
z = filter(temp, !(Continent %in% c("Africa", "Middle East"))) %>%
filter(TE_family == "RLX_Styx_consensus-1_partial") %>% pull(Nb_TIP) %>% max()
print(paste(x, y, z, sep = " "))
## [1] "5 5.79337899543379 26"
paste0("The median number of RLX Styx per isolate outside the center of origin is ", x)
## [1] "The median number of RLX Styx per isolate outside the center of origin is 5"
First, I will use the ngs_te_mapper2 results, and use each insertion identified in more than 5 samples to create different clustering. First, I’ll simply create a heatmap, then I’ll move on to PCAs, first by cluster, then with each isolate.
#PCA per isolate
matrice = TE_insertions %>%
#filter(!is.na(Cluster)) %>%
mutate(Cluster = ifelse(is.na(Cluster), "Hybrid", Cluster)) %>%
inner_join(filter(TE_insertions_counts, n > 10)) %>%
dplyr::count(TE_insertion, ID_file, Cluster) %>%
group_by(ID_file) %>%
mutate(Count_strain = sum(n)) %>%
pivot_wider(names_from = TE_insertion, values_from = n, values_fill = 0)
temp = as.matrix(matrice[,c(3:ncol(matrice))])[, which(apply(as.matrix(matrice[,c(3:ncol(matrice))]), 2, var) != 0)]
TE.pca = prcomp(temp, center = TRUE, scale. = TRUE)
#Plot
temp = as.tibble(cbind(matrice, as.data.frame(TE.pca$x))) %>%
dplyr::select(ID_file, Cluster, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8)
p1 = ggplot(temp, aes(x = PC1, y = PC2, col = Cluster, shape = Cluster)) +
geom_point() +
Color_Cluster3 + Shape_Cluster2 +
theme_light()
p2 = ggplot(temp, aes(x = PC3, y = PC4, col = Cluster, shape = Cluster)) +
geom_point() +
Color_Cluster3 + Shape_Cluster2 +
theme_light() + theme(legend.position = "none")
p3 = ggplot(temp, aes(x = PC5, y = PC6, col = Cluster, shape = Cluster)) +
geom_point() +
Color_Cluster3 + Shape_Cluster2 +
theme_light() + theme(legend.position = "none")
p4 = ggplot(temp, aes(x = PC7, y = PC8, col = Cluster, shape = Cluster)) +
geom_point() +
Color_Cluster3 + Shape_Cluster2 +
theme_light() + theme(legend.position = "none")
temp = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2, p3, p4)
cowplot::plot_grid(temp, get_legend(p1),
ncol = 2, rel_widths = c(1, .1))
It does look like there is clustering of samples according to geography. This is interesting as it shows that the types of TEs are different in different populations.
I want to gain some insights about where, in the genome, are these TIPs we have detected: are they on specific chromosomes? Is there a difference between core and accessory chromosomes? Can we detect hotspots of transposition, i.e., are there places in the genome which tend to have more TIPs than others?
TE_insertions_per_window = TE_insertions %>%
mutate(CHR = as.numeric(CHR),
window = trunc(((position + end)/2)/10000)) %>%
dplyr::select(TE_insertion, CHR, window) %>%
distinct() %>%
group_by(CHR, window) %>%
dplyr::count()
summar = ungroup(TE_insertions_per_window) %>%
summarise(median_TE = median(n),
sd_TE = sd(n))
TE_insertions_per_window = TE_insertions_per_window %>%
#inner_join(summar) %>%
mutate(outlier = ifelse(n >= summar$median_TE + 3*summar$sd_TE, "outlier", "non_outlier"))
TE_insertions_per_window %>%
ungroup() %>%
mutate(Start = window*10000, End = Start + 10000) %>%
dplyr::select(-window, TIP_count = n, TIP_category = outlier) %>%
write_tsv(file = paste0(TE_RIP_dir, "Ztritici_global_March2021.good_samples.10kb_windows.TIP_counts.tab"))
filter(TE_insertions_per_window, as.numeric(CHR) < 8) %>%
ggplot(aes(x = window, y = n, col = as.character(outlier))) +
theme_light() +
theme(legend.position = "none") +
scale_color_manual(values = c("#f5cac3", "#f28482")) +
geom_point(alpha = .8) +
geom_hline(yintercept = summar$median_TE + 2*summar$sd_TE,
linetype="dashed", col = "grey80") +
facet_grid(rows = vars(CHR), scales = "free_x")
filter(TE_insertions_per_window, as.numeric(CHR) >= 8 & as.numeric(CHR) < 14) %>%
ggplot(aes(x = window, y = n, col = outlier)) +
theme_light() +
theme(legend.position = "none") +
scale_color_manual(values = c("#f5cac3", "#f28482")) +
geom_point(alpha = .8) +
geom_hline(yintercept = summar$median_TE + 2*summar$sd_TE,
linetype="dashed", col = "grey80") +
facet_grid(rows = vars(CHR), scales = "free_x")
filter(TE_insertions_per_window, as.numeric(CHR) >= 14) %>%
ggplot(aes(x = window, y = n, col = outlier)) +
theme_light() +
theme(legend.position = "none") +
scale_color_manual(values = c("#f5cac3", "#f28482")) +
geom_point(alpha = .8) +
geom_hline(yintercept = summar$median_TE + 2*summar$sd_TE,
linetype="dashed", col = "grey80") +
facet_grid(rows = vars(CHR), scales = "free_x")
label1 = ungroup(TE_insertions_per_window) %>%
filter(!is.na(CHR)) %>%
dplyr::summarize(Average = mean(n)) %>% pull()
p1 = ungroup(TE_insertions_per_window) %>%
filter(!is.na(CHR)) %>%
group_by(CHR) %>%
dplyr::summarize(Average = mean(n)) %>%
dplyr::mutate(Chromosome_type = ifelse(CHR < 14, "core", "accessory")) %>%
ggplot(aes(x = reorder(as.character(CHR), CHR), y = Average, fill = Chromosome_type)) +
geom_bar(stat = "identity") +
geom_hline(yintercept = label1, linetype = "dashed", col = "grey40") +
theme_light() +
scale_fill_manual(values = c("#ffa62b", "#2FC6B7")) +
labs(title = paste0("GW TIP number average is ", round(label1, 2)),
x = "Chromosome", y = "Mean # TIPs per 10kb") +
theme(legend.position = "none")
p2 = ungroup(TE_insertions_per_window) %>%
dplyr::count(CHR, outlier) %>%
group_by(CHR) %>%
dplyr::mutate(Tot_window = sum(n)) %>%
mutate(prop = 100*n/Tot_window) %>%
mutate(Chromosome_type = ifelse(CHR < 14, "core", "accessory")) %>%
filter(outlier == "outlier") %>%
ggplot(aes(x = reorder(as.character(CHR), CHR), y = prop, fill = Chromosome_type)) +
geom_bar(stat = "identity") +
theme_light() +
scale_fill_manual(values = c("#ffa62b", "#2FC6B7")) +
labs(title = "Proportion of outlier windows",
x = "Chromosome", y = "% outlier windows") +
theme(legend.position = "none") +
geom_text(aes(y = 22, label = paste0(round(prop, 2), "%")), size = 3)
cowplot::plot_grid(p1 + coord_flip(), p2 + coord_flip())
ungroup(TE_insertions_per_window) %>%
dplyr::count(CHR, outlier) %>%
group_by(CHR) %>%
dplyr::mutate(Tot_window = sum(n)) %>%
mutate(prop = 100*n/Tot_window) %>%
filter(outlier == "outlier")
## # A tibble: 19 × 5
## # Groups: CHR [19]
## CHR outlier n Tot_window prop
## <dbl> <chr> <int> <int> <dbl>
## 1 1 outlier 4 592 0.676
## 2 2 outlier 4 371 1.08
## 3 3 outlier 6 330 1.82
## 4 4 outlier 2 271 0.738
## 5 5 outlier 6 269 2.23
## 6 6 outlier 5 249 2.01
## 7 7 outlier 1 254 0.394
## 8 8 outlier 4 226 1.77
## 9 9 outlier 9 206 4.37
## 10 10 outlier 5 161 3.11
## 11 11 outlier 3 158 1.90
## 12 13 outlier 4 115 3.48
## 13 14 outlier 3 70 4.29
## 14 15 outlier 6 62 9.68
## 15 17 outlier 5 52 9.62
## 16 18 outlier 1 52 1.92
## 17 19 outlier 5 52 9.62
## 18 20 outlier 1 45 2.22
## 19 21 outlier 3 37 8.11
ungroup(TE_insertions_per_window) %>% dplyr::count(outlier)
## # A tibble: 2 × 2
## outlier n
## <chr> <int>
## 1 non_outlier 3697
## 2 outlier 77
I now want to investigate the link between the TIP and the genome compartmentalization in terms of transcriptomics and epigenomics.
# Uploading commands from the cluster to my computer.
#Per windows: coordinates, RIP, GC, SNP counts
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv \
~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab \
~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.10kb_windows.SNP_counts.tab \
~/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/4_Joint_calling/
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.gene_coverage.tab \
~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.TE_coverage.tab \
~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/7_ChipSeq_peaks ../1_Variant_calling/
#This is done for each histone mark
#The data is from Klaas paper's on centromeres
#The peaks were called by macs2 (option --nomodel) after trimming and mapping with trimmomatic and bowtie + filtering of reads mapping with a quality sup to 30
grep -v "#" ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_Rep1_SRR2060839_peaks.xls | cut -f 1,2,3,10 | \
grep "peak" > ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_Rep1_SRR2060839_peaks.bed
grep -v "#" ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_Rep2_SRR2060840_peaks.xls | cut -f 1,2,3,10 | \
grep "peak" > ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_Rep2_SRR2060840_peaks.bed
~/Documents/Software/bedtools2/bin/bedtools intersect \
-a ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_Rep1_SRR2060839_peaks.bed \
-b ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_Rep2_SRR2060840_peaks.bed \
> ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_intersect_only_overlap.bed
#Get length of overlap between genes and windows for RNAseq
~/Documents/Software/bedtools2/bin/bedtools intersect -wo \
-a ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
-b ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.gff3 \
> ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.intersect_genes.txt
#Getting mappability per window
mappability = read_tsv(paste0(data_dir, "Zymoseptoria_tritici_genmap_mappability.bedgraph"),
col_names = c("CHR", "start", "end", "mappab"), col_types = c("c", "d", "d", "d")) %>%
dplyr::mutate(Length = (end - start), window = trunc(((start + end)/2)/10000)) %>%
group_by(CHR, window) %>%
dplyr::summarize(Mappability = weighted.mean(x = mappab, w = Length)) %>%
mutate(Start = window*10000) %>%
ungroup() %>%
dplyr::select(-window)
#Getting the methylation peaks
chipseq = bind_rows(read_tsv(paste0(chipseq_dir, "H3K27me3_intersect_only_overlap.bed"),
col_names = c("CHR", "start", "end", "peak"), col_types = c("c", "d", "d", "c")),
read_tsv(paste0(chipseq_dir, "H3K4me2_intersect_only_overlap.bed"),
col_names = c("CHR", "start", "end", "peak"), col_types = c("c", "d", "d", "c"))) %>%
bind_rows(read_tsv(paste0(chipseq_dir, "H3K9me2_intersect_only_overlap.bed"),
col_names = c("CHR", "start", "end", "peak"), col_types = c("c", "d", "d", "c"))) %>%
separate(col = peak, into = c("Mark"), sep ="_", remove = FALSE, extra = "drop") %>%
dplyr::mutate(Length = (end - start), window = trunc(((start + end)/2)/10000)) %>%
group_by(CHR, window, Mark) %>%
dplyr::summarize(Coverage = sum(Length)/10000) %>%
mutate(Start = window*10000) %>%
ungroup() %>%
dplyr::select(-window) %>%
pivot_wider(names_from = Mark, values_from = Coverage, values_fill = 0)
#RNAseq
TPM = read_tsv(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.intersect_genes.txt"),
col_names = c("CHR", "Start", "End", "X1", "X2", "X3", "Gene_start", "Gene_end",
"X4", "X5", "X6", "Annot", "Overlap")) %>%
dplyr::select(CHR, Start, End, Annot, Overlap) %>%
separate(col = Annot, into = c("X1", "X2", "X3", "X4","X5", "Protein_ID"), sep = "[=;]") %>%
inner_join(readxl::read_excel(paste0("/Users/afeurtey/Documents/Postdoc_Eva/Manuscripts/",
"Accepted/Alice_Cecile_Comparative_genomics/",
"Data_for_publication/Expression_data_Ztritici_inplanta.xlsx"),
skip = 5, sheet = "IPO323_TPM")) %>%
group_by(CHR, Start, End, Protein_ID) %>%
mutate(Average_bio = mean(c(A_1, A_2, B_1, B_2), na.rm = T),
Average_necro = mean(c(C_1, C_2, D_1, D_2)), na.rm = T,
CHR = as.character(CHR)) %>%
dplyr::select(CHR, Start, End, Protein_ID, Overlap, Average_bio, Average_necro)
TPM_per_window = TPM %>%
filter(Protein_ID != "Zt09_3_00018") %>%
group_by(CHR, Start, End) %>%
dplyr::summarize(TPM_bio = weighted.mean(x = Average_bio, w = Overlap),
TPM_necro = weighted.mean(x = Average_necro, w = Overlap))
#Getting all the data together
data = read_delim(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed"),
col_names = c("CHR", "Start", "End")) %>%
full_join(read_delim(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab"), skip = 1,
col_names = c("CHR", "Start", "End", "ATpercent", "GCpercent", "NbA", "NbC", "NbG", "NbT",
"NbN", "NbOther", "Window_length"))) %>%
dplyr::select(CHR, Start, End, GCpercent, Window_length) %>%
full_join(read_delim(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv"),
col_names = c("CHR", "Start", "End", "GC", "Product_index", "Substrate_index", "Composite_index"),
na = c("", "nan"), delim = "\t")) %>%
full_join(read_tsv(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.gene_coverage.tab"),
col_names = c("CHR", "Start", "End", "Nb_overlapping_genes", "Coverage_bp_gene", "Window_length", "Coverage_frac_gene"))) %>%
full_join(read_tsv(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.TE_coverage.tab"),
col_names = c("CHR", "Start", "End", "Nb_overlapping_TEs", "Coverage_bp_TE", "Window_length", "Coverage_frac_TE"))) %>%
full_join(read_delim(paste0(TE_RIP_dir, "Ztritici_global_March2021.good_samples.10kb_windows.TIP_counts.tab"),
col_types = c("c","d","c","d","d"))) %>%
full_join(read_tsv(paste0(vcf_dir,
"Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.10kb_windows.SNP_counts.tab"),
col_names = c("CHR", "Start", "Short_variant_number"), col_types = c("c","d","d")) %>%
mutate(Start = Start*10000)) %>%
full_join(mappability) %>%
full_join(chipseq) %>%
full_join(TPM_per_window)
The mappability can be a strong issue to detect variants and in particular TIPs, so I want to remove windows with low mappability before doing the PCA.
#Threshold setting, visualizing and data filtering
threshold = 0.85
data_PCA = data %>%
unite(CHR, Start, col = Window, sep = ":") %>%
filter(Window_length > 7500) %>%
#filter(TIP_count < 150) %>%
dplyr::select(Window, GC, Composite_index, Coverage_frac_gene, Coverage_frac_TE,
TIP_count, Short_variant_number, Mappability,
H3K27me3, H3K9me2, H3K4me2, TPM_bio, TPM_necro)
data_PCA[is.na(data_PCA)] <- 0
data_PCA %>%
ggplot(aes(Mappability)) +
geom_density(fill = "#EDE7E3", col ="#ADA7A9") +
geom_vline(xintercept = threshold, col = "#82c0cc") +
theme_light()
data_PCA2 = data_PCA %>%
filter(Mappability > threshold)
#First, density plots of pre and post filtering
pre_post_filter = bind_rows(data_PCA %>%
pivot_longer(cols = -Window, names_to = "Measure", values_to = "Estimate") %>%
mutate(Windows = "All"),
data_PCA2 %>%
pivot_longer(cols = -Window, names_to = "Measure", values_to = "Estimate") %>%
mutate(Windows = "Only mappable")) %>%
filter(Measure != "Mappability") %>%
group_by(Measure) %>%
dplyr::mutate(Med = median(Estimate)) %>%
mutate(Variable_category = case_when(str_detect(Measure, "H3") ~ "Epigenomics",
Measure == "Coverage_frac_TE" ~ "TE related",
str_detect(Measure, "TPM") ~ "Gene related",
Measure == "Coverage_frac_gene" ~ "Gene related",
Measure %in% c("GC", "Composite_index") ~ "TE related",
TRUE ~ "Variant counts"))
#Violing and boxplot of the short variant and TIP counts
pre_post_filter %>%
filter(Variable_category == "Variant counts") %>%
ggplot(aes(y = Estimate, x = Windows,
fill = Windows, col = Windows)) +
geom_violin(alpha = .4) +
geom_boxplot(alpha = .4, width = .2, outlier.shape = NA) +
theme_light() +
theme(panel.grid.major.x = element_blank()) +
labs(y = "Variant count per 10kb window", y = "") +
scale_color_manual(values = c("grey", "#2FC6B7"))+
scale_fill_manual(values = c("grey", "#2FC6B7")) +
facet_wrap(vars(Measure), scales = "free", ncol = 3)
ggsave(paste0(fig_dir, "FigS2_variants_filtered_on_mappability.pdf"), width = 12, height = 12, units = "cm")
Some variables are particularly related to mappability. Here, I will compare the values between the high and low mappability windows.
#Boxplots for a subset of the variables
temp = data %>%
filter(!is.na(Mappability)) %>%
mutate(Mappability = ifelse(Mappability < threshold, "Low", "High"))
#TE coverage
p1 = ggplot(temp, aes(x = Mappability, y = Coverage_frac_TE, col = Mappability, fill = Mappability)) +
geom_boxplot() +
theme_light()+
scale_color_manual(values = c("#ADA7A9", "#ffa62b")) +
scale_fill_manual(values = c("#EDE7E3", "#FFD399"))
#H3K27me3 mark
p2 = ggplot(temp, aes(x = Mappability, y = H3K27me3, col = Mappability, fill = Mappability)) +
geom_boxplot(alpha = .4) +
theme_light() +
scale_color_manual(values = c("#ADA7A9", "#ffa62b")) +
scale_fill_manual(values = c("#EDE7E3", "#FFD399"))
#Gene coverage
p3 = ggplot(temp, aes(x = Mappability, y = Coverage_frac_gene, col = Mappability, fill = Mappability)) +
geom_boxplot() +
theme_light() +
scale_color_manual(values = c("#ADA7A9", "#ffa62b")) +
scale_fill_manual(values = c("#EDE7E3", "#FFD399"))
#Composite index
p4 = data %>%
filter(!is.na(Mappability)) %>%
mutate(Mappability = ifelse(Mappability < threshold, "Low", "High")) %>%
ggplot(aes(x = Mappability, y = Composite_index, col = Mappability, fill = Mappability)) +
geom_boxplot() +
theme_light() +
scale_color_manual(values = c("#ADA7A9", "#ffa62b")) +
scale_fill_manual(values = c("#EDE7E3", "#FFD399"))
cowplot::plot_grid(p1 + theme(legend.position = "none", axis.title.x = element_blank()),
p2 + theme(legend.position = "none", axis.title.x = element_blank()),
p3 + theme(legend.position = "none"), p4 + theme(legend.position = "none"),
rel_heights = c(1, 1, 1.25, 1.25))
ggsave(paste0(fig_dir, "FigS2_boxplots_high_low_mappability.pdf"), width = 12, height = 12, units = "cm")
temp = data_PCA2 %>%
dplyr::select(-Mappability)
temp = as.matrix(temp[,c(2:ncol(temp))])[, which(apply(as.matrix(temp[,c(2:ncol(temp))]), 2, var) != 0)]
per_win.pca = prcomp(temp, center = TRUE,scale. = TRUE)
#Making custom PCA plot
arrows = as_tibble(as.data.frame(per_win.pca$rotation)) %>%
mutate(label = rownames(per_win.pca$rotation))
temp = as.tibble(cbind(data_PCA2, as.data.frame(per_win.pca$x)))
for_plot = temp %>%
separate(col = Window, into = c("CHR"), remove = F, extra = "drop", sep = ":") %>%
mutate(CHR_type = ifelse(as.numeric(CHR) < 14, "Core", "Accessory"))
eigs <- per_win.pca$sdev^2
ggplot() +
geom_point(data = for_plot, aes(x = PC1, y = PC2, col = CHR_type), alpha = .6, shape = 1) +
theme_light() +
geom_segment(data = arrows, aes(x = 0, y = 0, xend = PC1*10, yend = PC2*10 ),
arrow = arrow(length = unit(0.03, "npc"))) +
geom_text(data = arrows, aes(x = PC1*10, y = PC2*10, label = label)) +
coord_cartesian(xlim = c(-6, 10), ylim = c(-8, 7)) +
scale_color_manual(values = c("#ffa62b", "#2FC6B7")) +
labs(title = "PCA per 10kb windows",
subtitle = "Two first PCs",
x = paste0("PC1 (", round(100*eigs[1]/sum(eigs), 1), "% explained var.)"),
y = paste0("PC2 (", round(100*eigs[2]/sum(eigs), 1), "% explained var.)"))
temp = Zt_meta %>%
#filter(!(ID_file %in% filtered_samples$ID_file)) %>%
filter(!is.na(Longitude)) %>%
mutate(X = as.numeric(unlist(Longitude)),
Y = as.numeric(unlist(Latitude))) %>%
dplyr::select(X, Y) %>%
distinct()
sp = SpatialPoints(temp[, c("X", "Y")])
summary(sp)
## Object of class SpatialPoints
## Coordinates:
## min max
## X -122.9300 175.6576
## Y -46.2187 59.3294
## Is projected: NA
## proj4string : [NA]
## Number of points: 307
bio_list = list()
for (i in c(1:length(Bioclim_var))) {
file_name=paste0(data_dir,"Climatic_data/wc2.1_10m_bio/wc2.1_10m_bio_", i, ".tif")
rast_temp = raster(file_name)
bio_list[[Bioclim_var[i]]] = raster::extract(rast_temp, sp)
}
climate_per_coordinates = cbind(temp, do.call(cbind, bio_list))
dat = TE_qty %>%
full_join(., climate_per_coordinates,
by= c("Longitude" = "X", "Latitude" = "Y")) %>%
filter(!is.na(Continent))%>% filter(!is.na(Collection)) %>% filter(!is.na(Cluster)) %>%
filter(!is.na(Total_insertions))
names(dat)
## [1] "ID_file"
## [2] "Non_ref_insertions"
## [3] "Ref_insertions"
## [4] "Total_insertions"
## [5] "Continent"
## [6] "Country"
## [7] "Latitude"
## [8] "Longitude"
## [9] "Latitude2"
## [10] "Longitude2"
## [11] "Sampling_Date"
## [12] "Collection"
## [13] "Cluster"
## [14] "Annual Mean Temperature"
## [15] "Mean Diurnal Range "
## [16] "Isothermality (BIO2/BIO7) (×100)"
## [17] "Temperature Seasonality (standard deviation ×100)"
## [18] "Max Temperature of Warmest Month"
## [19] "Min Temperature of Coldest Month"
## [20] "Temperature Annual Range (BIO5-BIO6)"
## [21] "Mean Temperature of Wettest Quarter"
## [22] "Mean Temperature of Driest Quarter"
## [23] "Mean Temperature of Warmest Quarter"
## [24] "Mean Temperature of Coldest Quarter"
## [25] "Annual Precipitation"
## [26] "Precipitation of Wettest Month"
## [27] "Precipitation of Driest Month"
## [28] "Precipitation Seasonality (Coefficient of Variation)"
## [29] "Precipitation of Wettest Quarter"
## [30] "Precipitation of Driest Quarter"
## [31] "Precipitation of Warmest Quarter"
## [32] "Precipitation of Coldest Quarter"
dat %>%
filter(abs(Latitude) > 20) %>%
ggplot(aes(abs(Latitude), Total_insertions)) +
theme_light() +
geom_point(aes(color = Cluster, shape = Cluster)) +
Color_Cluster + Shape_Cluster +
geom_smooth(color = "grey20", method = "lm", se =F) +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 650) +
stat_regline_equation(label.y = 610)
dat %>%
filter(abs(Latitude) > 20) %>%
ggplot(aes(abs(Latitude), Total_insertions)) +
theme_light() +
geom_point(aes(color = Cluster, shape = Cluster)) +
geom_smooth(aes(color = Cluster), method = "lm", se =F) +
Color_Cluster + Shape_Cluster +
facet_wrap(vars(Cluster))+
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 160) +
stat_regline_equation(label.y = 100)
model_climate = lm(Total_insertions ~ Collection + Cluster + Latitude,
data=dat)
summary(model_climate)
##
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Latitude,
## data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.333 -21.702 -1.209 18.948 241.381
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 250.8555 8.8272 28.418 < 2e-16 ***
## CollectionETHZ_2020 102.3821 6.5211 15.700 < 2e-16 ***
## CollectionHartmann_FstQst_2015 1.1524 6.3930 0.180 0.856990
## CollectionHartmann_Oregon_2016 180.9941 7.8124 23.168 < 2e-16 ***
## CollectionINRAE_BIOGER 67.4046 5.1370 13.121 < 2e-16 ***
## CollectionJGI 48.1781 7.9148 6.087 1.61e-09 ***
## CollectionJGI_2 39.5121 7.9700 4.958 8.32e-07 ***
## CollectionJGI_3 54.7530 9.5907 5.709 1.48e-08 ***
## CollectionJGI_4 82.5057 21.0920 3.912 9.75e-05 ***
## CollectionJGI_5 70.6836 7.4022 9.549 < 2e-16 ***
## CollectionMMcDonald_2018 171.3117 17.0885 10.025 < 2e-16 ***
## CollectionSyngenta 211.1625 4.3000 49.108 < 2e-16 ***
## CollectionUnine_third_batch_2018 -21.7386 10.0172 -2.170 0.030221 *
## ClusterV1 -31.7350 9.6639 -3.284 0.001058 **
## ClusterV10 80.3299 6.5783 12.211 < 2e-16 ***
## ClusterV11 -62.8027 7.6789 -8.179 8.24e-16 ***
## ClusterV2 28.9759 5.6946 5.088 4.28e-07 ***
## ClusterV3 1.9224 10.1270 0.190 0.849480
## ClusterV4 96.5048 15.4650 6.240 6.33e-10 ***
## ClusterV5 38.5572 14.0331 2.748 0.006107 **
## ClusterV6 -48.9305 7.7032 -6.352 3.16e-10 ***
## ClusterV7 -39.0489 10.3165 -3.785 0.000162 ***
## ClusterV8 66.7578 9.8909 6.749 2.45e-11 ***
## ClusterV9 56.9904 14.8753 3.831 0.000135 ***
## Latitude 0.4609 0.1748 2.636 0.008505 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.83 on 1048 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8707, Adjusted R-squared: 0.8678
## F-statistic: 294.2 on 24 and 1048 DF, p-value: < 2.2e-16
dat2 = dat %>%
pivot_longer(cols = -c(ID_file, Non_ref_insertions, Ref_insertions, Total_insertions,
Continent, Country, Sampling_Date, Collection, Cluster),
names_to = "Variable", values_to = "Estimate_variable")
short_list_bioclim = c("Annual Mean Temperature", "Mean Diurnal Range ", "Temperature Seasonality (standard deviation ×100)" ,
"Annual Precipitation", "Precipitation Seasonality (Coefficient of Variation)")
for (var_of_interest in Bioclim_var){
model_climate = lm(Total_insertions ~ Collection + Cluster + Estimate_variable,
data=filter(dat2, Variable == var_of_interest))
temp = summary(model_climate)
if (temp$coefficients['Estimate_variable',4] <= 0.05){
print(var_of_interest)
print(temp)
print(dat2 %>%
filter(Variable == var_of_interest) %>%
ggplot(aes(Estimate_variable, Total_insertions)) +
theme_light() +
geom_point(aes(color = Cluster, shape = Cluster)) +
Color_Cluster + Shape_Cluster +
geom_smooth(color = "grey20", method = "lm", se =F) +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 150) +
stat_regline_equation(label.y = 110) +
labs( x = str_wrap(var_of_interest, 40)))
print(dat2 %>%
filter(Variable == var_of_interest) %>%
ggplot(aes(Estimate_variable, Total_insertions)) +
theme_light() +
geom_point(aes(color = Cluster, shape = Cluster)) +
geom_smooth(aes(color = Cluster), method = "lm", se =F) +
Color_Cluster + Shape_Cluster +
facet_wrap(vars(Cluster))+
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 160) +
stat_regline_equation(label.y = 100) +
labs( x = str_wrap(var_of_interest, 40)))
}
}
## [1] "Annual Mean Temperature"
##
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable,
## data = filter(dat2, Variable == var_of_interest))
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.892 -20.997 -1.225 19.077 242.636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 309.7700 9.9228 31.218 < 2e-16 ***
## CollectionETHZ_2020 108.8584 6.5810 16.541 < 2e-16 ***
## CollectionHartmann_FstQst_2015 1.4096 6.3204 0.223 0.823566
## CollectionHartmann_Oregon_2016 180.9727 7.7207 23.440 < 2e-16 ***
## CollectionINRAE_BIOGER 75.5250 5.2593 14.360 < 2e-16 ***
## CollectionJGI 51.6194 7.8460 6.579 7.46e-11 ***
## CollectionJGI_2 45.6106 7.9592 5.731 1.31e-08 ***
## CollectionJGI_3 60.9234 9.5308 6.392 2.46e-10 ***
## CollectionJGI_4 92.4521 20.8971 4.424 1.07e-05 ***
## CollectionJGI_5 79.0263 7.4937 10.546 < 2e-16 ***
## CollectionMMcDonald_2018 130.7912 10.0008 13.078 < 2e-16 ***
## CollectionSyngenta 216.0019 4.2720 50.562 < 2e-16 ***
## CollectionUnine_third_batch_2018 -21.9028 9.8748 -2.218 0.026765 *
## ClusterV1 -25.3970 9.5933 -2.647 0.008234 **
## ClusterV10 83.9475 6.3368 13.248 < 2e-16 ***
## ClusterV11 -49.3068 7.7771 -6.340 3.41e-10 ***
## ClusterV2 24.2764 5.5203 4.398 1.21e-05 ***
## ClusterV3 2.6154 10.0136 0.261 0.794003
## ClusterV4 79.1163 9.6269 8.218 6.05e-16 ***
## ClusterV5 15.0175 7.8373 1.916 0.055617 .
## ClusterV6 -22.3153 9.0856 -2.456 0.014206 *
## ClusterV7 -20.0044 10.7381 -1.863 0.062752 .
## ClusterV8 40.7932 11.1182 3.669 0.000256 ***
## ClusterV9 32.0229 10.0119 3.198 0.001423 **
## Estimate_variable -3.6879 0.6603 -5.585 2.98e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.43 on 1048 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8737, Adjusted R-squared: 0.8708
## F-statistic: 301.9 on 24 and 1048 DF, p-value: < 2.2e-16
## [1] "Mean Diurnal Range "
##
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable,
## data = filter(dat2, Variable == var_of_interest))
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.195 -21.939 -0.956 19.106 237.323
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 296.052 12.768 23.188 < 2e-16 ***
## CollectionETHZ_2020 103.876 6.614 15.706 < 2e-16 ***
## CollectionHartmann_FstQst_2015 3.145 6.429 0.489 0.62480
## CollectionHartmann_Oregon_2016 183.125 7.827 23.397 < 2e-16 ***
## CollectionINRAE_BIOGER 67.695 5.135 13.184 < 2e-16 ***
## CollectionJGI 48.938 7.917 6.181 9.08e-10 ***
## CollectionJGI_2 36.582 7.865 4.651 3.72e-06 ***
## CollectionJGI_3 54.278 9.597 5.656 2.00e-08 ***
## CollectionJGI_4 84.512 21.083 4.008 6.54e-05 ***
## CollectionJGI_5 70.881 7.406 9.571 < 2e-16 ***
## CollectionMMcDonald_2018 133.414 10.104 13.204 < 2e-16 ***
## CollectionSyngenta 212.296 4.273 49.684 < 2e-16 ***
## CollectionUnine_third_batch_2018 -21.209 10.002 -2.121 0.03419 *
## ClusterV1 -27.558 9.728 -2.833 0.00470 **
## ClusterV10 87.593 6.535 13.404 < 2e-16 ***
## ClusterV11 -59.811 7.616 -7.853 1.00e-14 ***
## ClusterV2 29.146 5.672 5.139 3.30e-07 ***
## ClusterV3 4.468 10.181 0.439 0.66087
## ClusterV4 70.290 9.645 7.288 6.19e-13 ***
## ClusterV5 6.967 7.827 0.890 0.37360
## ClusterV6 -49.810 7.692 -6.476 1.45e-10 ***
## ClusterV7 -33.822 10.486 -3.226 0.00130 **
## ClusterV8 78.113 9.618 8.122 1.28e-15 ***
## ClusterV9 35.744 10.502 3.404 0.00069 ***
## Estimate_variable -2.877 1.094 -2.630 0.00867 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.83 on 1048 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8707, Adjusted R-squared: 0.8678
## F-statistic: 294.2 on 24 and 1048 DF, p-value: < 2.2e-16
## [1] "Isothermality (BIO2/BIO7) (×100)"
##
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable,
## data = filter(dat2, Variable == var_of_interest))
##
## Residuals:
## Min 1Q Median 3Q Max
## -142.654 -21.634 -1.384 19.685 242.398
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 284.1170 9.8762 28.768 < 2e-16 ***
## CollectionETHZ_2020 101.7464 6.5097 15.630 < 2e-16 ***
## CollectionHartmann_FstQst_2015 4.0133 6.5037 0.617 0.537318
## CollectionHartmann_Oregon_2016 185.5921 7.9997 23.200 < 2e-16 ***
## CollectionINRAE_BIOGER 69.8230 5.2117 13.397 < 2e-16 ***
## CollectionJGI 49.4529 7.9342 6.233 6.63e-10 ***
## CollectionJGI_2 38.9554 7.9703 4.888 1.18e-06 ***
## CollectionJGI_3 55.2100 9.5960 5.753 1.15e-08 ***
## CollectionJGI_4 85.9543 21.1148 4.071 5.04e-05 ***
## CollectionJGI_5 72.4344 7.4872 9.674 < 2e-16 ***
## CollectionMMcDonald_2018 140.8752 10.4263 13.512 < 2e-16 ***
## CollectionSyngenta 214.1764 4.3435 49.310 < 2e-16 ***
## CollectionUnine_third_batch_2018 -20.3452 9.9945 -2.036 0.042038 *
## ClusterV1 -30.2209 9.6659 -3.127 0.001817 **
## ClusterV10 83.5069 6.4224 13.002 < 2e-16 ***
## ClusterV11 -60.6150 7.6235 -7.951 4.76e-15 ***
## ClusterV2 32.2845 5.3677 6.015 2.49e-09 ***
## ClusterV3 1.8548 10.1354 0.183 0.854831
## ClusterV4 65.0799 9.3647 6.950 6.43e-12 ***
## ClusterV5 9.6724 7.8675 1.229 0.219191
## ClusterV6 -48.3635 7.7344 -6.253 5.85e-10 ***
## ClusterV7 -41.6691 10.4033 -4.005 6.63e-05 ***
## ClusterV8 67.7641 9.8979 6.846 1.29e-11 ***
## ClusterV9 35.3606 10.5887 3.339 0.000869 ***
## Estimate_variable -0.4630 0.2035 -2.276 0.023071 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.86 on 1048 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8705, Adjusted R-squared: 0.8676
## F-statistic: 293.6 on 24 and 1048 DF, p-value: < 2.2e-16
## [1] "Max Temperature of Warmest Month"
##
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable,
## data = filter(dat2, Variable == var_of_interest))
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.961 -21.516 -1.078 18.673 235.270
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 326.4286 16.1775 20.178 < 2e-16 ***
## CollectionETHZ_2020 106.4341 6.6256 16.064 < 2e-16 ***
## CollectionHartmann_FstQst_2015 0.2957 6.3709 0.046 0.9630
## CollectionHartmann_Oregon_2016 178.8620 7.8069 22.911 < 2e-16 ***
## CollectionINRAE_BIOGER 69.9151 5.1387 13.606 < 2e-16 ***
## CollectionJGI 49.7920 7.8883 6.312 4.06e-10 ***
## CollectionJGI_2 37.2500 7.8350 4.754 2.27e-06 ***
## CollectionJGI_3 58.9397 9.5879 6.147 1.12e-09 ***
## CollectionJGI_4 87.4284 21.0088 4.162 3.42e-05 ***
## CollectionJGI_5 72.3892 7.3913 9.794 < 2e-16 ***
## CollectionMMcDonald_2018 120.9225 10.6396 11.365 < 2e-16 ***
## CollectionSyngenta 212.1185 4.2551 49.850 < 2e-16 ***
## CollectionUnine_third_batch_2018 -20.6013 9.9395 -2.073 0.0384 *
## ClusterV1 -24.4337 9.7408 -2.508 0.0123 *
## ClusterV10 88.2744 6.4612 13.662 < 2e-16 ***
## ClusterV11 -53.2513 7.7789 -6.846 1.29e-11 ***
## ClusterV2 25.7476 5.6958 4.520 6.87e-06 ***
## ClusterV3 4.0718 10.1010 0.403 0.6870
## ClusterV4 74.5656 9.6724 7.709 2.93e-14 ***
## ClusterV5 6.8292 7.7903 0.877 0.3809
## ClusterV6 -40.4657 8.0277 -5.041 5.46e-07 ***
## ClusterV7 -18.2214 11.4862 -1.586 0.1130
## ClusterV8 69.7670 9.5148 7.332 4.51e-13 ***
## ClusterV9 25.8505 10.0770 2.565 0.0104 *
## Estimate_variable -2.2004 0.5506 -3.996 6.88e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.68 on 1048 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8718, Adjusted R-squared: 0.8689
## F-statistic: 297.1 on 24 and 1048 DF, p-value: < 2.2e-16
## [1] "Min Temperature of Coldest Month"
##
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable,
## data = filter(dat2, Variable == var_of_interest))
##
## Residuals:
## Min 1Q Median 3Q Max
## -142.583 -21.982 -1.591 20.106 244.394
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 264.5865 6.4071 41.296 < 2e-16 ***
## CollectionETHZ_2020 101.7783 6.4808 15.705 < 2e-16 ***
## CollectionHartmann_FstQst_2015 4.0480 6.4391 0.629 0.52971
## CollectionHartmann_Oregon_2016 186.5699 7.9457 23.480 < 2e-16 ***
## CollectionINRAE_BIOGER 73.5067 5.4278 13.543 < 2e-16 ***
## CollectionJGI 49.0130 7.9049 6.200 8.09e-10 ***
## CollectionJGI_2 41.3326 8.0235 5.151 3.09e-07 ***
## CollectionJGI_3 57.6884 9.6005 6.009 2.57e-09 ***
## CollectionJGI_4 89.9914 21.1309 4.259 2.24e-05 ***
## CollectionJGI_5 74.7044 7.5399 9.908 < 2e-16 ***
## CollectionMMcDonald_2018 139.2740 10.1638 13.703 < 2e-16 ***
## CollectionSyngenta 215.9113 4.4038 49.029 < 2e-16 ***
## CollectionUnine_third_batch_2018 -21.0476 9.9762 -2.110 0.03511 *
## ClusterV1 -29.1681 9.6531 -3.022 0.00258 **
## ClusterV10 82.7477 6.4167 12.896 < 2e-16 ***
## ClusterV11 -54.5472 7.8086 -6.986 5.03e-12 ***
## ClusterV2 33.3031 5.2612 6.330 3.63e-10 ***
## ClusterV3 0.7259 10.1164 0.072 0.94281
## ClusterV4 67.3669 9.3906 7.174 1.38e-12 ***
## ClusterV5 14.5665 8.0902 1.801 0.07207 .
## ClusterV6 -39.2917 8.3981 -4.679 3.27e-06 ***
## ClusterV7 -37.0792 10.3142 -3.595 0.00034 ***
## ClusterV8 48.6452 12.4076 3.921 9.41e-05 ***
## ClusterV9 33.1502 10.2071 3.248 0.00120 **
## Estimate_variable -1.3618 0.4287 -3.176 0.00154 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.78 on 1048 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8711, Adjusted R-squared: 0.8682
## F-statistic: 295.2 on 24 and 1048 DF, p-value: < 2.2e-16
## [1] "Mean Temperature of Driest Quarter"
##
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable,
## data = filter(dat2, Variable == var_of_interest))
##
## Residuals:
## Min 1Q Median 3Q Max
## -144.556 -21.372 -1.701 20.106 245.387
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 274.7080 6.6978 41.015 < 2e-16 ***
## CollectionETHZ_2020 102.5593 6.4834 15.819 < 2e-16 ***
## CollectionHartmann_FstQst_2015 5.7385 6.4849 0.885 0.376419
## CollectionHartmann_Oregon_2016 189.0828 8.0420 23.512 < 2e-16 ***
## CollectionINRAE_BIOGER 74.9882 5.4789 13.687 < 2e-16 ***
## CollectionJGI 48.3689 7.8904 6.130 1.24e-09 ***
## CollectionJGI_2 40.0543 7.9141 5.061 4.92e-07 ***
## CollectionJGI_3 62.5016 9.7539 6.408 2.23e-10 ***
## CollectionJGI_4 92.7866 21.1503 4.387 1.27e-05 ***
## CollectionJGI_5 74.6322 7.4874 9.968 < 2e-16 ***
## CollectionMMcDonald_2018 137.2590 10.0764 13.622 < 2e-16 ***
## CollectionSyngenta 217.3042 4.4623 48.698 < 2e-16 ***
## CollectionUnine_third_batch_2018 -19.5326 9.9470 -1.964 0.049833 *
## ClusterV1 -28.1976 9.6503 -2.922 0.003553 **
## ClusterV10 83.1448 6.3966 12.998 < 2e-16 ***
## ClusterV11 -52.9067 7.8489 -6.741 2.60e-11 ***
## ClusterV2 28.4470 5.5132 5.160 2.96e-07 ***
## ClusterV3 0.4401 10.1020 0.044 0.965257
## ClusterV4 69.5691 9.4397 7.370 3.46e-13 ***
## ClusterV5 5.5546 7.8207 0.710 0.477710
## ClusterV6 -41.6432 8.0078 -5.200 2.39e-07 ***
## ClusterV7 -27.3351 10.7490 -2.543 0.011132 *
## ClusterV8 56.6657 10.5947 5.348 1.09e-07 ***
## ClusterV9 33.6045 10.1797 3.301 0.000995 ***
## Estimate_variable -0.7754 0.2121 -3.657 0.000268 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.72 on 1048 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8715, Adjusted R-squared: 0.8686
## F-statistic: 296.2 on 24 and 1048 DF, p-value: < 2.2e-16
## [1] "Mean Temperature of Warmest Quarter"
##
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable,
## data = filter(dat2, Variable == var_of_interest))
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.481 -21.593 -1.068 18.377 237.588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 328.1999 14.7150 22.304 < 2e-16 ***
## CollectionETHZ_2020 107.5958 6.6214 16.250 < 2e-16 ***
## CollectionHartmann_FstQst_2015 -2.6261 6.4081 -0.410 0.68203
## CollectionHartmann_Oregon_2016 174.2193 7.9232 21.988 < 2e-16 ***
## CollectionINRAE_BIOGER 69.5798 5.1137 13.606 < 2e-16 ***
## CollectionJGI 50.7252 7.8772 6.439 1.82e-10 ***
## CollectionJGI_2 38.8089 7.8326 4.955 8.44e-07 ***
## CollectionJGI_3 60.0291 9.5761 6.269 5.31e-10 ***
## CollectionJGI_4 87.2215 20.9515 4.163 3.40e-05 ***
## CollectionJGI_5 73.4438 7.3869 9.942 < 2e-16 ***
## CollectionMMcDonald_2018 118.5501 10.6319 11.150 < 2e-16 ***
## CollectionSyngenta 211.8242 4.2459 49.889 < 2e-16 ***
## CollectionUnine_third_batch_2018 -19.6465 9.9104 -1.982 0.04769 *
## ClusterV1 -25.7297 9.6508 -2.666 0.00779 **
## ClusterV10 86.1769 6.3798 13.508 < 2e-16 ***
## ClusterV11 -51.8024 7.7796 -6.659 4.45e-11 ***
## ClusterV2 25.2824 5.6069 4.509 7.24e-06 ***
## ClusterV3 3.4890 10.0661 0.347 0.72896
## ClusterV4 77.2179 9.7139 7.949 4.83e-15 ***
## ClusterV5 6.8008 7.7702 0.875 0.38165
## ClusterV6 -32.1168 8.5763 -3.745 0.00019 ***
## ClusterV7 -14.3287 11.5361 -1.242 0.21449
## ClusterV8 65.4204 9.6151 6.804 1.71e-11 ***
## ClusterV9 21.1809 10.1499 2.087 0.03715 *
## Estimate_variable -3.0424 0.6599 -4.610 4.52e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.59 on 1048 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8725, Adjusted R-squared: 0.8696
## F-statistic: 298.8 on 24 and 1048 DF, p-value: < 2.2e-16
## [1] "Mean Temperature of Coldest Quarter"
##
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable,
## data = filter(dat2, Variable == var_of_interest))
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.14 -21.13 -1.37 20.01 244.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 271.9332 6.4656 42.058 < 2e-16 ***
## CollectionETHZ_2020 104.0141 6.5117 15.973 < 2e-16 ***
## CollectionHartmann_FstQst_2015 4.5120 6.4136 0.704 0.481900
## CollectionHartmann_Oregon_2016 186.6368 7.8712 23.711 < 2e-16 ***
## CollectionINRAE_BIOGER 74.7352 5.3933 13.857 < 2e-16 ***
## CollectionJGI 50.4771 7.8971 6.392 2.46e-10 ***
## CollectionJGI_2 43.8724 8.0667 5.439 6.68e-08 ***
## CollectionJGI_3 58.6113 9.5799 6.118 1.34e-09 ***
## CollectionJGI_4 91.6710 21.0750 4.350 1.50e-05 ***
## CollectionJGI_5 76.7822 7.5629 10.152 < 2e-16 ***
## CollectionMMcDonald_2018 140.2431 10.1302 13.844 < 2e-16 ***
## CollectionSyngenta 216.6828 4.3836 49.430 < 2e-16 ***
## CollectionUnine_third_batch_2018 -21.2005 9.9446 -2.132 0.033251 *
## ClusterV1 -28.1481 9.6348 -2.921 0.003558 **
## ClusterV10 82.7662 6.3920 12.948 < 2e-16 ***
## ClusterV11 -53.4135 7.7683 -6.876 1.06e-11 ***
## ClusterV2 30.7678 5.3213 5.782 9.73e-09 ***
## ClusterV3 1.3566 10.0835 0.135 0.893007
## ClusterV4 70.4716 9.4431 7.463 1.78e-13 ***
## ClusterV5 15.5652 8.0208 1.941 0.052574 .
## ClusterV6 -35.3874 8.4900 -4.168 3.32e-05 ***
## ClusterV7 -35.4433 10.3048 -3.439 0.000606 ***
## ClusterV8 45.5873 11.8243 3.855 0.000123 ***
## ClusterV9 35.8864 10.2399 3.505 0.000477 ***
## Estimate_variable -1.7137 0.4274 -4.010 6.50e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.68 on 1048 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8719, Adjusted R-squared: 0.8689
## F-statistic: 297.1 on 24 and 1048 DF, p-value: < 2.2e-16
I now look at the repeat-induced point mutations in reads that map on the different TE consensus. We expect to see differences in the different geographical groups so I start by visualizing this.
#while read p; do fichier_list=$(ls -1 /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/${p}\.*txt) ; for fichier in $fichier_list; do short_name=$(echo $fichier | cut -f 8 -d "/" | cut -f 2 -d "." ) ; comp=$(grep Composite $fichier) ; echo $p $short_name $comp ; done; done < Keep_lists_samples/Ztritici_global_March2021.genotyped.good_samples.args > /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt
reads_per_TE = read_delim(paste0(RIP_DIR, "Nb_reads_per_TE.txt"), delim = "\t",
col_names = c("ID_file", "TE", "Length",
"# mapped read-segments", "# unmapped read-segments")) %>%
filter(TE != "*") %>%
separate(TE, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
dplyr::mutate(Order = ifelse(!grepl('^D',TE), "Class II (DNA transposons)", "Class I (retrotransposons)")) %>%
left_join(Zt_meta %>% dplyr::select(ID_file, Collection, Country, Continent)) %>%
unite(Continent, Country, ID_file, col = "for_display", remove = F)
temp = reads_per_TE %>% group_by(ID_file) %>%
dplyr::summarise(Reads_mapped_per_TE = sum(`# mapped read-segments`))
reads_per_TE = left_join(reads_per_TE, temp) %>%
dplyr::mutate(Normalized_nb_reads_mapped = `# mapped read-segments` / Reads_mapped_per_TE)
RIP=read_delim(paste0(RIP_DIR, "Composite_index.txt"),
col_names = c("ID_file", "TE", "Variable", "Composite_median", "Composite_mean" ),
delim = " ", na = c("nan", "NA", "")) %>%
separate(TE, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
mutate(Order = ifelse(!grepl('^D',TE), "Class II (DNA transposons)", "Class I (retrotransposons)"))%>%
left_join(Zt_meta %>% dplyr::select(Continent, Cluster, Country, Collection, ID_file)) %>%
unite(Continent, Country, ID_file, col = "for_display", remove = F) %>%
left_join(., reads_per_TE)
#DONE: merge with reads_per_TE to be able to filter the points that have too few reads mapped!
#Per cluster
world_RIP_avg <-
RIP %>%
filter(TE == "RIP_est") %>%
group_by(Cluster) %>%
dplyr::summarize(avg = mean(as.numeric(Composite_median), na.rm = T)) %>%
ungroup() %>%
dplyr::summarize(avg = mean(as.numeric(avg), na.rm = T)) %>%
pull(avg)
ordering_table = tibble(Cluster = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"),
Order_tree = c(6,9,8,7,5,2,1,10,4,11,3))
temp = RIP %>%
filter(TE == "RIP_est") %>%
filter(!is.na(Cluster)) %>%
group_by(Cluster) %>%
dplyr::mutate(region_avg = mean(as.numeric(Composite_median), na.rm = T)) %>%
left_join(., ordering_table) %>%
arrange(Order_tree)
RIP_plot = ggplot(temp, aes(x = reorder(Cluster, -Order_tree),
y = as.numeric(Composite_median),
color = Cluster)) +
coord_flip() +
geom_segment(aes(x = reorder(Cluster, -region_avg), xend = reorder(Cluster, -region_avg),
y = world_RIP_avg, yend = region_avg), size = 0.8) +
geom_jitter(size = 1.5, alpha = 0.2, width = 0.2) +
geom_hline(aes(yintercept = world_RIP_avg), color = "gray70", size = 0.6) +
stat_summary(fun = mean, geom = "point", size = 5) +
Color_Cluster +
theme_light() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = "RIP composite index",
title = "RIP levels per Cluster",
subtitle = str_wrap(paste("The RIP levels in reads mapping on TE consensus",
"are high in the Middle-East",
"and low in North America in particular."), width = 70))
#Statistical test
#One-way ANOVA with blocks
##Define linear model
model = lm(as.numeric(Composite_median) ~ Cluster + Collection ,
data=temp)
Anova(model,type = "II")
## Anova Table (Type II tests)
##
## Response: as.numeric(Composite_median)
## Sum Sq Df F value Pr(>F)
## Cluster 19.7728 10 320.237 < 2.2e-16 ***
## Collection 6.5833 13 82.016 < 2.2e-16 ***
## Residuals 5.9521 964
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
marginal = lsmeans(model, ~ Cluster)
pairs(marginal, adjust="sidak")
## contrast estimate SE df t.ratio p.value
## V1 - V10 0.133632 0.0553 964 2.417 0.5842
## V1 - V11 -0.491634 0.0573 964 -8.584 <.0001
## V1 - V2 -0.084395 0.0557 964 -1.516 0.9995
## V1 - V3 0.003394 0.0188 964 0.181 1.0000
## V1 - V4 0.091778 0.0524 964 1.752 0.9899
## V1 - V5 -0.083909 0.0574 964 -1.463 0.9998
## V1 - V6 -0.556971 0.0560 964 -9.942 <.0001
## V1 - V7 -0.375493 0.0595 964 -6.312 <.0001
## V1 - V8 0.058906 0.0585 964 1.007 1.0000
## V1 - V9 0.034169 0.0590 964 0.579 1.0000
## V10 - V11 -0.625266 0.0180 964 -34.793 <.0001
## V10 - V2 -0.218027 0.0133 964 -16.342 <.0001
## V10 - V3 -0.130238 0.0584 964 -2.230 0.7647
## V10 - V4 -0.041854 0.0177 964 -2.368 0.6334
## V10 - V5 -0.217541 0.0179 964 -12.167 <.0001
## V10 - V6 -0.690603 0.0151 964 -45.589 <.0001
## V10 - V7 -0.509125 0.0241 964 -21.137 <.0001
## V10 - V8 -0.074726 0.0215 964 -3.479 0.0285
## V10 - V9 -0.099463 0.0227 964 -4.389 0.0007
## V11 - V2 0.407239 0.0168 964 24.193 <.0001
## V11 - V3 0.495028 0.0603 964 8.212 <.0001
## V11 - V4 0.583412 0.0231 964 25.202 <.0001
## V11 - V5 0.407725 0.0203 964 20.069 <.0001
## V11 - V6 -0.065337 0.0202 964 -3.236 0.0666
## V11 - V7 0.116141 0.0263 964 4.412 0.0006
## V11 - V8 0.550541 0.0239 964 23.033 <.0001
## V11 - V9 0.525803 0.0247 964 21.270 <.0001
## V2 - V3 0.087789 0.0588 964 1.494 0.9997
## V2 - V4 0.176173 0.0189 964 9.345 <.0001
## V2 - V5 0.000486 0.0181 964 0.027 1.0000
## V2 - V6 -0.472576 0.0164 964 -28.839 <.0001
## V2 - V7 -0.291098 0.0236 964 -12.354 <.0001
## V2 - V8 0.143301 0.0216 964 6.648 <.0001
## V2 - V9 0.118564 0.0229 964 5.180 <.0001
## V3 - V4 0.088384 0.0557 964 1.588 0.9986
## V3 - V5 -0.087303 0.0604 964 -1.446 0.9999
## V3 - V6 -0.560366 0.0591 964 -9.482 <.0001
## V3 - V7 -0.378887 0.0624 964 -6.073 <.0001
## V3 - V8 0.055512 0.0615 964 0.903 1.0000
## V3 - V9 0.030774 0.0619 964 0.497 1.0000
## V4 - V5 -0.175687 0.0234 964 -7.524 <.0001
## V4 - V6 -0.648749 0.0199 964 -32.663 <.0001
## V4 - V7 -0.467271 0.0282 964 -16.578 <.0001
## V4 - V8 -0.032871 0.0261 964 -1.261 1.0000
## V4 - V9 -0.057609 0.0272 964 -2.119 0.8537
## V5 - V6 -0.473062 0.0201 964 -23.547 <.0001
## V5 - V7 -0.291584 0.0261 964 -11.152 <.0001
## V5 - V8 0.142815 0.0237 964 6.026 <.0001
## V5 - V9 0.118078 0.0243 964 4.850 0.0001
## V6 - V7 0.181478 0.0259 964 7.012 <.0001
## V6 - V8 0.615878 0.0236 964 26.055 <.0001
## V6 - V9 0.591140 0.0244 964 24.220 <.0001
## V7 - V8 0.434400 0.0283 964 15.371 <.0001
## V7 - V9 0.409662 0.0296 964 13.853 <.0001
## V8 - V9 -0.024738 0.0275 964 -0.900 1.0000
##
## Results are averaged over the levels of: Collection
## Note: contrasts are still on the as.numeric scale
## P value adjustment: sidak method for 55 tests
CLD_RIP = cld(marginal,
alpha = 0.05,
Letters = letters, ### Use lower-case letters for .group
adjust = "sidak") ### sidak-adjusted p-values
CLD_RIP$.group=gsub(" ", "", CLD_RIP$.group)
text_y = (max(as.numeric(temp$Composite_median), na.rm = T) + 0.1*max(as.numeric(temp$Composite_median), na.rm = T))
RIP_plot +
geom_text(data = CLD_RIP, aes(x = Cluster,
y = text_y,
label = .group), color = "black")
The RIP index does seem consistent with what Cécile has found, with higher RIP in the Middle-East and African populations and lower in the rest (in particular North America and Oceania).
As this pattern matches with the pattern of TE per cluster, I look at the relation between the amount of reads mapping on TE consensus and the level of RIP detected.
TE_RIP = inner_join(TE_qty, RIP %>% filter(TE == "RIP_est") %>% dplyr::select(-Cluster)) %>%
inner_join(read_delim(paste0(RIP_DIR, "Nb_reads.txt"), delim = " ") %>%
dplyr::filter(Total_reads > Te_aligned_reads) %>%
dplyr::mutate(Percent_TE_Reads = Te_aligned_reads * 100 / Total_reads))
TE_RIP$Sampling_Date[is.na(TE_RIP$Sampling_Date)] <- "Unknown"
# RIP vs TIPS
p1 = TE_RIP %>%
filter(Collection != "Hartmann_FstQst_2015") %>%
ggplot(aes(as.numeric(Total_insertions), as.numeric(Composite_median),
color = Cluster, shape = Cluster))+
theme_light() +
geom_point() +
Color_Cluster3 + Shape_Cluster2 +
labs(color = "Genetic cluster",
x = "TE insertion number", y = "RIP composite median",
subtitle = "Without the OG Hartmann collection")
p2 = TE_RIP %>%
ggplot(aes(as.numeric(Total_insertions), as.numeric(Composite_median),
color = Cluster, shape = Cluster))+
theme_light() +
geom_point() +
Color_Cluster3 + Shape_Cluster2 +
labs(color = "Genetic cluster",
x = "TE insertion number", y = "RIP composite median",
subtitle = "With the OG Hartmann collection")
ligne = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2+ theme(legend.position = "none"))
title <- ggdraw() +
draw_label("TE quantity vs RIP levels", fontface = 'bold',
x = 0, hjust = 0) + theme_void() + theme(plot.margin = margin(0, 0, 0, 55))
cowplot::plot_grid(title, ligne, get_legend(p1 + theme(legend.position = "bottom")),
ncol = 1, rel_heights = c(0.1, 1, 0.2), align = "hv")
TE_RIP %>%
filter(Collection != "Hartmann_FstQst_2015") %>%
filter(Cluster %in% c("V1", "V10", "V2", "V3", "V4", "V5", "V8", "V9")) %>%
ggplot(aes(as.numeric(Total_insertions), as.numeric(Composite_median)))+
theme_light() +
geom_point(aes(color = Cluster, shape = Cluster)) +
Color_Cluster3 + Shape_Cluster2 +
labs(color = "Genetic cluster",
x = "TE insertion number", y = "RIP composite median",
subtitle = "Without the OG Hartmann collection") +
geom_smooth(color = "grey20", linetype = "dashed", method = "lm", se =F)+
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 1) +
stat_regline_equation(label.y = 1.07)
TE_RIP %>%
filter(Collection != "Hartmann_FstQst_2015") %>%
filter(Cluster == "V2") %>%
ggplot(aes(as.numeric(Total_insertions), as.numeric(Composite_median)))+
theme_light() +
geom_point(aes(color = Cluster, shape = Cluster)) +
Color_Cluster3 + Shape_Cluster2 +
labs(color = "Genetic cluster",
x = "TE insertion number", y = "RIP composite median",
subtitle = "Without the OG Hartmann collection") +
geom_smooth(color = "grey20", linetype = "dashed", method = "lm", se =F)+
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 1) +
stat_regline_equation(label.y = 1.07)
If the RIP machinery is not functional in some populations, we would expect a bimodal distribution of RIP: older TEs would show RIP signatures, while recent insertions would be free or almost free from RIP signatures. The populations in which RIP is still active should instead only contain ripped TEs.
percent_low_rip = read_delim(paste0(TE_RIP_dir, "Summary_RIP_non_RIP_reads.txt"), delim = " ") %>%
mutate(Percent = 100 * Non_ripped_count / (Non_ripped_count + Ripped_count)) %>%
inner_join(TE_qty, by = c("Sample" = "ID_file")) %>%
filter(Continent != "Asia" & !is.na(Continent)) %>%
filter(!is.na(Cluster))
percent_low_rip %>%
filter(Cluster %in% c("V11", "V6", "V7")) %>%
filter(Collection != "Hartmann_FstQst_2015") %>%
dplyr::summarize(average = mean(Percent), min(Percent), max(Percent))
## # A tibble: 1 × 3
## average `min(Percent)` `max(Percent)`
## <dbl> <dbl> <dbl>
## 1 7.48 5.18 9.99
percent_low_rip %>%
filter(Cluster %in% c("V1", "V2", "V3","V4", "V5","V8", "V9", "V10")) %>%
filter(Collection != "Hartmann_FstQst_2015") %>%
filter(Cluster != "Hybrid")%>%
dplyr::summarize(average = mean(Percent), min(Percent), max(Percent))
## # A tibble: 1 × 3
## average `min(Percent)` `max(Percent)`
## <dbl> <dbl> <dbl>
## 1 20.6 11.3 34.6
median_percent = percent_low_rip %>%
filter(Collection != "Hartmann_FstQst_2015") %>%
filter(Cluster != "Hybrid")%>%
group_by(Cluster) %>%
dplyr::summarise(Med_percent_rip = median(Percent)) %>%
arrange(Med_percent_rip)
median_percent = median_percent %>%
mutate(Cluster = factor(Cluster, levels = median_percent$Cluster))
percent_low_rip %>%
filter(Collection != "Hartmann_FstQst_2015") %>%
filter(Cluster != "Hybrid") %>%
mutate(Cluster = factor(Cluster, levels = median_percent$Cluster)) %>%
ggplot(aes(reorder(Sample, Percent), Percent, fill = Cluster)) +
geom_bar(stat = "identity") +
Fill_Cluster3 +
theme_light() +
geom_hline(data = median_percent, aes(yintercept = Med_percent_rip)) +
facet_grid(.~ Cluster, scales = "free_x", space = "free_x") +
theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank(),
axis.text.x = element_blank(), axis.ticks.x = element_blank(),
panel.spacing.x=unit(0, "lines"),
legend.position = "bottom") +
labs(x = "Isolates sorted by cluster and percentage of non-ripped TE-reads",
y = "Percentage of non-ripped TE-reads",
title = "Without Hartmann collection")
#read_delim(paste0(TE_RIP_dir, "Summary_RIP_non_RIP_reads.txt"), delim = " ") %>%
# mutate(Percent = 100 * Non_ripped_count / (Non_ripped_count + Ripped_count)) %>%
# inner_join(TE_qty, by = c("Sample" = "ID_file")) %>%
# dplyr::count(Cluster, Continent) %>%
# pivot_wider(names_from = Continent, values_from = n, values_fill = 0)
percent_low_rip %>%
filter(Collection != "Hartmann_FstQst_2015") %>%
ggplot(aes(x = Total_insertions, y = Percent, color = Continent)) +
geom_point(alpha = .4) +
geom_smooth(method = "lm") +
theme_light() +
Color_Continent + Fill_Continent +
facet_wrap(vars(Continent), scale = "free") +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 35, size = 3) +
stat_regline_equation(label.y = 38, aes(label = ..eq.label..), size = 3) +
labs(y = "Percentage of non-ripped TE reads",
subtitle = "Without the Hartmann collection")
p1 = percent_low_rip %>%
filter(Collection != "Hartmann_FstQst_2015") %>%
ggplot(aes(x = Total_insertions, y = Percent, color = Cluster, shape = Cluster)) +
geom_point(alpha = .9) +
theme_light() +
Shape_Cluster2 + Color_Cluster3 +
labs(y = "Percentage of non-ripped TE reads",title = "Without Hartmann collection")
p2 = percent_low_rip %>%
ggplot(aes(x = Total_insertions, y = Percent, color = Cluster, shape = Cluster)) +
geom_point(alpha = .9) +
theme_light() +
Shape_Cluster2 + Color_Cluster3 +
labs(y = "Percentage of non-ripped TE reads",title = "All collections")
p1
p2
model1 = lm(Total_insertions ~ Collection + Cluster + Percent,
data=percent_low_rip)
summary(model1)
##
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Percent,
## data = percent_low_rip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -147.711 -20.572 -1.342 18.784 244.128
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 234.6468 8.5120 27.566 < 2e-16 ***
## CollectionETHZ_2020 105.1234 6.4423 16.318 < 2e-16 ***
## CollectionHartmann_FstQst_2015 -17.6457 7.1615 -2.464 0.013900 *
## CollectionHartmann_Oregon_2016 179.9868 7.7116 23.340 < 2e-16 ***
## CollectionINRAE_BIOGER 70.6916 5.0982 13.866 < 2e-16 ***
## CollectionJGI 61.8108 7.9987 7.728 2.56e-14 ***
## CollectionJGI_2 44.8284 7.9198 5.660 1.95e-08 ***
## CollectionJGI_3 61.9321 9.5411 6.491 1.31e-10 ***
## CollectionJGI_4 92.0781 20.8784 4.410 1.14e-05 ***
## CollectionJGI_5 76.8673 7.4087 10.375 < 2e-16 ***
## CollectionMMcDonald_2018 133.1208 9.9820 13.336 < 2e-16 ***
## CollectionSyngenta 208.3454 4.2842 48.631 < 2e-16 ***
## CollectionUnine_third_batch_2018 -15.5216 10.1339 -1.532 0.125912
## ClusterV1 -35.2236 9.5748 -3.679 0.000246 ***
## ClusterV10 62.4303 7.4387 8.393 < 2e-16 ***
## ClusterV11 -47.6819 7.8519 -6.073 1.76e-09 ***
## ClusterV2 28.9798 5.3119 5.456 6.09e-08 ***
## ClusterV3 1.0373 10.0067 0.104 0.917457
## ClusterV4 41.0599 9.9086 4.144 3.69e-05 ***
## ClusterV5 1.6088 7.8200 0.206 0.837042
## ClusterV6 -33.0717 8.1793 -4.043 5.66e-05 ***
## ClusterV7 -29.0594 10.3378 -2.811 0.005031 **
## ClusterV8 51.8314 10.1741 5.094 4.15e-07 ***
## ClusterV9 7.1309 10.6963 0.667 0.505126
## Percent 2.0034 0.3565 5.619 2.46e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.4 on 1049 degrees of freedom
## (30 observations deleted due to missingness)
## Multiple R-squared: 0.8735, Adjusted R-squared: 0.8706
## F-statistic: 301.7 on 24 and 1049 DF, p-value: < 2.2e-16
With the Illumina data, we can look at each read but not necessarily at each copy. For this, I will look at the PacBio assemblies and the TE copies prediction and RIP estimation from Lorrain et al. 2021.
Lorrain_RIP = read_tsv(paste0(TE_RIP_dir, "Lorrain_MeanMedianCompositeValue_RIP_TEcopies.txt")) %>%
full_join(read_delim(paste0(metadata_dir, "PacBio_metadata.csv"),
delim = ","))
TE_nb = Lorrain_RIP %>% dplyr::count(Sample, name = "TE_count") %>% arrange(TE_count)
Lorrain_RIP %>%
left_join(TE_nb) %>%
filter(Species == "Z. tritici") %>%
group_by(Sample) %>%
dplyr::mutate(median_RIP = median(median)) %>%
ggplot(aes(y = reorder(Sample, median_RIP), x = median)) +
geom_vline(xintercept = 0, linetype = "dashed") +
geom_density_ridges(aes(fill = Continent), alpha = .7) +
theme_light() +
coord_cartesian(xlim = c(-2.5, 5)) +
Fill_Continent +
labs(x = "RIP composite per TE copy", y = "Isolate",
title = "Distribution of RIP in TE copies of fully assembled genomes")
It is known that different TE groups, in particular the MITEs, which are particularly small are less RIPped than other types of TEs. I wanted to check whether we saw such a pattern and so I visualize here the RIP per superfamily of TEs and then as related to the size of the consensus.
#Per TE superfamily
RIP %>%
filter(Normalized_nb_reads_mapped > 0.0001) %>%
group_by(Superfamily)%>%
mutate(median_Superfamily=median(Composite_median, na.rm = T) )%>%
ggplot(aes(x = Superfamily,
y = as.numeric(Composite_median),
fill = median_Superfamily)) +
geom_boxplot(outlier.shape = NA) +
theme_light() +
ylab("Median of composite index on TE reads") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 40, hjust = 1)) +
ylim(-1, 5) + geom_hline( yintercept = 0, color = "navy")
temp = RIP %>%
filter(Continent != "Asia") %>%
filter(Normalized_nb_reads_mapped > 0.0001) %>%
group_by(Superfamily, Continent)%>%
dplyr::mutate(median_Superfamily=median(Composite_median, na.rm = T),
for_alpha = median(Normalized_nb_reads_mapped))
p1 = filter(temp, Order == "Class I (retrotransposons)") %>%
ggplot(aes(x = Superfamily,
y = as.numeric(Composite_median),
color = Continent,
fill = Continent, alpha = for_alpha)) +
geom_boxplot(outlier.shape = NA) +
theme_light() +
labs(y ="RIP composite index",
title = "RIP on TE reads in the different continents") +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
ylim(-1, 5) + geom_hline( yintercept = 0, color = "navy") +
Color_Continent + Fill_Continent
p2 = filter(temp, Order == "Class II (DNA transposons)")%>%
ggplot(aes(x = Superfamily,
y = as.numeric(Composite_median),
color = Continent,
fill = Continent, alpha = for_alpha)) +
geom_boxplot(outlier.shape = NA) +
theme_light() +
labs(y ="RIP composite index") +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
ylim(-1, 5) + geom_hline( yintercept = 0, color = "navy") +
Color_Continent+ Fill_Continent
cowplot::plot_grid(cowplot::plot_grid(p1 + theme(legend.position = "None", axis.title.x = element_blank()),
p2 + theme(legend.position = "None"),
ncol = 1, rel_heights = c(1, 1)),
get_legend(p1), nrow = 1, rel_widths = c(1, 0.3))
In other species, RIP has been shown to have a size limit under which it is not capable of recognizing the repeated sequences, so I check here whether we recognize a similar pattern and whether some of the shortest TEs are escaping RIP.
#As relating to TE size
#NB: in the following plot, the alpha parameter is set to make the TE without any reads (or near so) invisible
#This means that not all consensus are visible. In particular, some, annotated by Ursula as "verylowcopy" are not.
TE_consensus_faidx =read_delim(paste0(TE_RIP_dir, "Badet_BMC_Biology_2020_TE_consensus_sequences.fasta.fai"),
col_names = c("TE", "length", "offset",
"number of bases per line", "number of bytes per line"), delim = "\t")
p = inner_join(RIP, TE_consensus_faidx) %>%
dplyr::group_by(Order, TE, length) %>%
filter (TE != "RIP_est") %>%
filter(!is.na(Normalized_nb_reads_mapped)) %>%
dplyr::summarize(median_per_consensus=median(Composite_median, na.rm = T),
norm_read_mapped = median(Normalized_nb_reads_mapped), na.rm = T) %>%
ggplot(aes(x = length, y = median_per_consensus, color = Order)) +
geom_point(aes( text = TE, alpha = norm_read_mapped)) + theme_light() +
geom_hline(yintercept = 0) +
labs(x = "TE consensus length (bp, log10 scale)",
y = "Median of RIP composite index",
title = str_wrap(paste("Median of the RIP composite index for each TE consensus",
"against the length of the consensus sequence"), width = 60)) +
scale_x_continuous(trans = "log10") +
scale_alpha_continuous(trans = "log10")
ggsave("RIP_per_consensus.pdf", plot = p)
ggplotly(p)
Next step will be to check if dim2 is present where the RIP values suggest they are: intact copies in the Middle-East and Africa and absence/degeneration in the rest. Here, I use de novo genome assemblies and try to identify copies of dim2. For this, I use a deRIPped sequence of dim2 in the reference, blasted it on to Zt10 to get the sequence of Zt10_dim2 since it is known to have an intact sequence (see Mareike’s paper). I then compare this sequence (blastn) with de de novo assemblies to pull all the copies and identify the highest identity score.
#Here the Zt10_dim2_from_MgDNMT_deRIP.fa is the sequence of Zt10_6.417 which corresponds to the deRIPPed version of dim2 in the reference IPO323 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2940312/pdf/GEN186167.pdf
#sbatch -p normal.168h --array=1-1195%50 Detect_gene_blast_array.sh /home/alice/WW_PopGen/Directories_new.sh /data2/alice/WW_project/0_Data/Zt10_dim2_from_MgDNMT_deRIP.fa /home/alice/WW_PopGen/Keep_lists_samples/Good_assemblies.args Illumina /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/
Let’s look at the results as dot plots and compare the results from the dim2 blast to the RIP composite index. So far the Middle-Eastern samples seem quite similar to one another, whereas other regions contain way more variability such as Europe.
In order to identify the native dim2 copy in genomes, I look for several things:
#system(paste0("cat ", DIM2_DIR, "*.blast.tab > ", DIM2_DIR, "Overall_results_blast_dim2.txt"))
length_dim2 = 3846
length_flank1 = 3689
length_flank2 = 1222
threshold_length_dim2 = 0.8 * length_dim2
threshold_length_flank1 = 0.8 * length_flank1
threshold_length_flank2 = 0.8 * length_flank2
dim2_blast_results_complete = read_delim(paste0(DIM2_DIR, "Overall_results_blast_dim2.txt"), delim = " ",
col_names = c("sample", "gene", "qseqid", "sseqid", "pident", "length",
"mismatch", "gapopen", "qstart", "qend",
"sstart", "send", "evalue", "bitscore"))
#dim2_blast_results =
flank1 = dim2_blast_results_complete %>%
filter(gene == "dim2_flank1_Zt10_unitig_006_0418") %>%
dplyr::select(sample, gene, sseqid, length, sstart, send) %>%
dplyr::mutate(midcoord_flank1 = (sstart + send)/2) %>%
filter(length > threshold_length_flank1) %>%
pivot_wider(names_from = gene, values_from = sseqid) %>%
dplyr::select(-length, -sstart, -send) %>%
group_by(sample) %>%
dplyr::mutate(nb_gene = n(),
dim2_flank1 = dim2_flank1_Zt10_unitig_006_0418 )
flank2 = dim2_blast_results_complete %>%
filter(gene == "dim2_flank2_Zt10_unitig_006_0416") %>%
dplyr::select(sample, gene, sseqid, length, sstart, send) %>%
dplyr::mutate(midcoord_flank2 = (sstart + send)/2) %>%
filter(length > threshold_length_flank2) %>%
pivot_wider(names_from = gene, values_from = sseqid) %>%
dplyr::select(-length, -sstart, -send) %>%
group_by(sample) %>%
dplyr::mutate(nb_gene = n(),
dim2_flank2 = dim2_flank2_Zt10_unitig_006_0416)
#Some dim2 copies have insertions and thus are identified as several matches
# I merge together the matches closer than 1kb
#grep "Zt10_dim2_from_MgDNMT_deRIP" Overall_results_blast_dim2.txt | awk 'BEGIN {OFS = "\t"} {if ($11 < $12) print $1":"$4, $11, $12, $5, $6; else print $1":"$4, $12, $11, $5, $6}' > Overall_results_blast_dim2.bed
#sort -k1,1 -k2,2n Overall_results_blast_dim2.bed > Overall_results_blast_dim2.sorted.bed
#bedtools merge -i Overall_results_blast_dim2.sorted.bed -c 4,5 -o collapse -d 1000 -delim ":" > /Overall_results_blast_dim2.sorted.merged_1kb.bed
#And recreate the pident as the mean of the pident of the matches weigthed by the length of the fragments
dim2_blast_results = read_delim(paste0(DIM2_DIR, "Overall_results_blast_dim2.sorted.merged_1kb.bed"),
delim = "\t",
col_names = c("temp", "sstart", "send", "pident_temp", "length_temp")) %>%
separate(col = temp, into = c("sample", "sseqid"), sep = ":") %>%
separate_rows(pident_temp, length_temp, sep = ":", convert = TRUE) %>%
group_by(sample, sseqid, sstart, send) %>%
dplyr::summarize(pident = weighted.mean(pident_temp, length_temp),
length = sum(length_temp)) %>%
#Add the flanks information and find the middle coordinates for the 3 genes
full_join(., flank1, by = "sample") %>%
full_join(., flank2, by = "sample") %>%
dplyr::mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
dim2_flank2 = replace_na(dim2_flank2, "Not_found"),
midcoord_flank1 = replace_na(midcoord_flank1, 0),
midcoord_flank2 = replace_na(midcoord_flank2, 0)) %>%
dplyr::mutate(midcoord_flank1 = as.numeric(midcoord_flank1),
midcoord_flank2 = as.numeric(midcoord_flank2)) %>%
dplyr::mutate(ave_coord = (sstart + send)/2) %>%
dplyr::mutate(min_fl = ifelse(midcoord_flank1 > midcoord_flank2,
midcoord_flank2, midcoord_flank1)) %>%
dplyr::mutate(max_fl = ifelse(midcoord_flank1 > midcoord_flank2,
midcoord_flank1, midcoord_flank2)) %>%
inner_join(Zt_meta, by = c("sample" = "ID_file")) %>%
#Keep only the samples with either one or zero match for each flanking gene
filter(nb_gene.x <= 1 & nb_gene.y <= 1)
#Now, let's compare the blast results of dim2 with the flanking genes
# (contig name and distance)
dim2_blast_results = dim2_blast_results %>%
dplyr::mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
"Both",
ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
"None",
ifelse(sseqid == dim2_flank1, "Flank1",
ifelse(sseqid == dim2_flank2, "Flank2", "What"))))) %>%
dplyr::mutate(distance = ifelse(is_same_contig == "None", "No_distance",
ifelse(is_same_contig == "Both",
ifelse(ave_coord >= min_fl &
ave_coord <= max_fl ,
"Distance_OK", "Not_between"),
ifelse(is_same_contig == "Flank1",
ifelse((abs(midcoord_flank1 - ave_coord) <= 10000),
"Distance_OK", "Too_far"),
ifelse((abs(midcoord_flank2 - ave_coord) <= 10000),
"Distance_OK", "Too_far"))))) %>%
mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0",
ifelse(distance == "Distance_OK", ">1", "0")))
#dplyr::select(sample, sseqid, length, dim2_flank1, dim2_flank2, is_same_contig) %>%
# Table percentages
temp = ungroup(dim2_blast_results) %>% dplyr::count(is_same_contig, name = "Nb_blast") %>%
mutate(Percentage = round(100*Nb_blast/sum(Nb_blast), 1))
kable(temp)
| is_same_contig | Nb_blast | Percentage |
|---|---|---|
| Both | 387 | 3.4 |
| Flank1 | 501 | 4.4 |
| Flank2 | 306 | 2.7 |
| None | 10123 | 89.4 |
#The insertions in some copies are longer than 1kb and cause the native gene copy to be recognized as separate two blast matches. The stats don't make sense without taking this into consideration.
#temp = ungroup(dim2_blast_results) %>% dplyr::count(is_same_contig, sample, name = "Nb_blast") %>% pivot_wider(names_from = "is_same_contig", values_from = "Nb_blast")
#Pie chart and length density
nb_sample = length(unique(dim2_blast_results$sample))
p1 = dim2_blast_results %>%
ggplot(aes(length, fill = is_same_contig, color = is_same_contig)) +
geom_density(alpha = 0.5) +
theme_light() +
labs(x = "Length (bp)",
y = "Density",
title = str_wrap(paste0("Blast matches against Zt10dim2 for ", nb_sample, " samples"),
width = 60),
fill = "Neighbouring flanking genes",
color = "Neighbouring flanking genes")+
scale_color_manual(values =c("#ff9f1c","#1B998B", "#2ec4b6","#EDE7E3")) +
scale_fill_manual(values =c("#ff9f1c","#1B998B", "#2ec4b6","#EDE7E3"))
##Pie
temp <- temp %>%
arrange(desc(is_same_contig)) %>%
mutate(prop = Nb_blast / sum(temp$Nb_blast) *100) %>%
mutate(ypos = cumsum(Nb_blast)- 0.5*Nb_blast )
p2 = temp %>%
ggplot(aes(x = "")) +
geom_bar(aes(fill = is_same_contig, y = Nb_blast), stat ="identity", width = 1) +
coord_polar("y", start = 0)+
theme_void()+
scale_color_manual(values =c("#ff9f1c","#1B998B", "#2ec4b6","#EDE7E3")) +
scale_fill_manual(values =c("#ff9f1c","#1B998B", "#2ec4b6","#EDE7E3")) +
geom_text_repel(aes(y = ypos, label = paste0(Percentage, "%")), color = "black", size=4)
ggdraw(p1) +
draw_plot(p2 + theme(legend.position = "None"), x = 0.08, y = 0.05, width = 0.25, height = 1.3)
Now, I will select only the copies which have a least one flanking gene found on the same contig. I consider these the “native” copy of the gene.
Here a large proportion of the Middle-Eastern samples from the old Hartmann dataset have two matches on the “right” contigs: this is due to a deletion found in one of the allele of dim2 (shown in Mareike’s new version of her dim2 manuscript).
Let’s investigate quickly how many long copies there are in each genome. I’m also interested in the native match as related to RIP and geography.
temp = dim2_blast_results %>%
group_by(sample) %>%
dplyr::summarize(n_matches = n(),
n_long_matches = sum(length > 1000))
sum_dim2_blast = dim2_blast_results %>%
dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
group_by(sample) %>%
dplyr::summarize(length_sum = sum(length),
pident_wm = weighted.mean(pident, length),
n_matches_on_native_contigs = n()) %>%
filter(length_sum < length_dim2 + 200) %>%
inner_join(., temp) %>%
inner_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file"))
sum_dim2_blast %>%
#filter(Collection == "Hartmann_FstQst_2015") %>%
ggplot(aes(as.numeric(Composite_median), pident_wm, color = Cluster, shape = Cluster))+
geom_point() +
Shape_Cluster2 + Color_Cluster3 +
theme_light() + theme(legend.position = "none") +
labs(x = str_wrap("RIP composite index (median per isolate)",
width = 40),
y = str_wrap("Identity of the native match", width = 40),
title = "Identity with functional dim2 and RIP composite index of TEs",
subtitle = "All collections")
sum_dim2_blast %>%
filter(Collection != "Hartmann_FstQst_2015") %>%
ggplot(aes(as.numeric(Composite_median), pident_wm, color = Cluster, shape = Cluster))+
geom_point() +
Shape_Cluster2 + Color_Cluster3 +
theme_light() + theme(legend.position = "none") +
labs(x = str_wrap(paste("RIP composite index",
" (median per isolate)"),
width = 40),
y = str_wrap("Identity of the native match",
width = 40),
title = "Identity with functional dim2 and RIP composite index of TEs",
subtitle = "Without Hartmann collection")
And then as boxplots per continental region.
sum_dim2_blast %>%
filter(!is.na(Cluster)) %>%
group_by(Cluster) %>%
dplyr::mutate(avg_per_cont = mean(pident_wm, na.rm = T)) %>%
ggplot(aes(reorder(Cluster, avg_per_cont), pident_wm)) +
geom_violin(aes(color = Cluster, fill = Cluster), alpha = .4) +
geom_boxplot(width = .1, aes(color = Cluster)) +
# geom_jitter(aes(col = Cluster), size = 0.8, alpha = 0.6, width = 0.2, height = 0.1) +
Fill_Cluster3 + Color_Cluster3 +
theme_light() +
theme(legend.position = "none",
axis.text.x = element_text(size = 10, angle = 40, hjust = 1)) +
labs(x = NULL, y = "Identity of the native copy to Zt10dim2")
sum_dim2_blast %>%
filter(!is.na(Cluster)) %>%
group_by(Cluster) %>%
dplyr::mutate(avg_per_clust = mean(n_long_matches, na.rm = T)) %>%
ggplot(aes(x = reorder(Cluster, avg_per_clust), y = n_long_matches)) +
geom_violin(aes(color = Cluster, fill = Cluster), alpha = .4) +
geom_boxplot(width = .1, aes(color = Cluster)) +
Color_Cluster3 + Fill_Cluster3 + theme_light() +
labs(x = str_wrap("Cluster",width = 20),
y = str_wrap("Number of long blast matches (above 1kb)",
width = 50),
color = "Genetic cluster") +
theme(axis.title=element_text(size=10))
If there is a relaxation of RIP, we could expect that TEs would not be the only things impacted but that gene duplications would also be possible more in RIP-relaxed genomes than in RIP-active.
Badet_pan_table = read_tsv(paste0(TE_RIP_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt")) %>%
mutate(isolate = case_when(isolate == "1A5" ~ "Zt1A5",
isolate == "1E4" ~ "Zt1E4",
isolate == "3D1" ~ "Zt3D1",
isolate == "3D7" ~ "Zt3D7",
TRUE ~ isolate))
Lorrain_RIP = read_tsv(paste0(TE_RIP_dir, "Lorrain_MeanMedianTECompositeValue_RIP_perStrain.txt")) %>%
full_join(read_delim(paste0(metadata_dir, "PacBio_metadata.csv"),
delim = ","), by = c("Strain" = "Sample"))
temp = Badet_pan_table %>%
dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
group_by(isolate) %>%
dplyr::mutate(nb_genes = n()) %>%
dplyr::filter(N_genes >= 2) %>%
dplyr::mutate(N_genes_cat = ifelse(N_genes >= 9, "9 +", as.character(N_genes))) %>%
group_by(isolate, N_genes_cat) %>%
dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
dplyr::mutate(Percent = 100 * count / nb_genes)
inner_join(temp, Lorrain_RIP, by = c("isolate" = "Strain")) %>%
filter(N_genes_cat != "9 +") %>%
group_by(Composite_median, isolate, Continent) %>%
dplyr::summarize(count = sum(count)) %>%
ggplot(aes(x = count, y = Composite_median, label = isolate, color = Continent)) +
geom_text() +
theme_light() + Color_Continent +
labs(x = "Number of genes found in several copies", y = "RIP composite median on TE",
title = "Copy number between 2 and 9")
inner_join(temp, Lorrain_RIP, by = c("isolate" = "Strain")) %>%
filter(N_genes_cat == "2") %>%
group_by(Composite_median, isolate, Continent) %>%
dplyr::summarize(count = sum(count)) %>%
ggplot(aes(x = count, y = Composite_median, label = isolate, color = Continent)) +
geom_text() +
theme_light() + Color_Continent +
labs(x = "Number of genes found with two copies", y = "RIP composite median on TE")